diff options
Diffstat (limited to 'arch/x86')
214 files changed, 4831 insertions, 4342 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index beea77046f9b..c1e1931f591f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -30,7 +30,6 @@ config X86_64 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select SWIOTLB - select ARCH_HAS_SYSCALL_WRAPPER config FORCE_DYNAMIC_FTRACE def_bool y @@ -57,7 +56,6 @@ config X86 select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_32BIT_OFF_T if X86_32 - select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL @@ -80,6 +78,7 @@ config X86 select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE + select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -128,6 +127,7 @@ config X86 select GENERIC_GETTIMEOFDAY select GENERIC_VDSO_TIME_NS select GUP_GET_PTE_LOW_HIGH if X86_PAE + select HARDIRQS_SW_RESEND select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI @@ -240,11 +240,6 @@ config OUTPUT_FORMAT default "elf32-i386" if X86_32 default "elf64-x86-64" if X86_64 -config ARCH_DEFCONFIG - string - default "arch/x86/configs/i386_defconfig" if X86_32 - default "arch/x86/configs/x86_64_defconfig" if X86_64 - config LOCKDEP_SUPPORT def_bool y @@ -1875,7 +1870,6 @@ config X86_SMAP config X86_UMIP def_bool y - depends on CPU_SUP_INTEL || CPU_SUP_AMD prompt "User Mode Instruction Prevention" if EXPERT ---help--- User Mode Instruction Prevention (UMIP) is a security feature in @@ -2418,7 +2412,7 @@ config CMDLINE config CMDLINE_OVERRIDE bool "Built-in command line overrides boot loader arguments" - depends on CMDLINE_BOOL + depends on CMDLINE_BOOL && CMDLINE != "" ---help--- Set this option to 'Y' to have the kernel ignore the boot loader command line, and use ONLY the built-in command line. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 94df0868804b..513a55562d75 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -194,9 +194,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1) sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1) sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1) +adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr) KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 012b82fc8617..e17be90ab312 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -68,6 +68,7 @@ clean-files += cpustr.h KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables GCOV_PROFILE := n UBSAN_SANITIZE := n @@ -88,7 +89,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|efi32_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 26050ae0b27e..5f7c262bcc99 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -39,6 +39,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n @@ -87,10 +88,7 @@ endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone - -vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o \ - $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o # The compressed kernel is built with -fPIC/-fPIE so that a boot loader diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c deleted file mode 100644 index 287393d725f0..000000000000 --- a/arch/x86/boot/compressed/eboot.c +++ /dev/null @@ -1,889 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -/* ----------------------------------------------------------------------- - * - * Copyright 2011 Intel Corporation; author Matt Fleming - * - * ----------------------------------------------------------------------- */ - -#pragma GCC visibility push(hidden) - -#include <linux/efi.h> -#include <linux/pci.h> - -#include <asm/efi.h> -#include <asm/e820/types.h> -#include <asm/setup.h> -#include <asm/desc.h> -#include <asm/boot.h> - -#include "../string.h" -#include "eboot.h" - -static efi_system_table_t *sys_table; -extern const bool efi_is64; - -__pure efi_system_table_t *efi_system_table(void) -{ - return sys_table; -} - -__attribute_const__ bool efi_is_64bit(void) -{ - if (IS_ENABLED(CONFIG_EFI_MIXED)) - return efi_is64; - return IS_ENABLED(CONFIG_X86_64); -} - -static efi_status_t -preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) -{ - struct pci_setup_rom *rom = NULL; - efi_status_t status; - unsigned long size; - uint64_t romsize; - void *romimage; - - /* - * Some firmware images contain EFI function pointers at the place where - * the romimage and romsize fields are supposed to be. Typically the EFI - * code is mapped at high addresses, translating to an unrealistically - * large romsize. The UEFI spec limits the size of option ROMs to 16 - * MiB so we reject any ROMs over 16 MiB in size to catch this. - */ - romimage = efi_table_attr(pci, romimage); - romsize = efi_table_attr(pci, romsize); - if (!romimage || !romsize || romsize > SZ_16M) - return EFI_INVALID_PARAMETER; - - size = romsize + sizeof(*rom); - - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size, - (void **)&rom); - if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'rom'\n"); - return status; - } - - memset(rom, 0, sizeof(*rom)); - - rom->data.type = SETUP_PCI; - rom->data.len = size - sizeof(struct setup_data); - rom->data.next = 0; - rom->pcilen = pci->romsize; - *__rom = rom; - - status = efi_call_proto(pci, pci.read, EfiPciIoWidthUint16, - PCI_VENDOR_ID, 1, &rom->vendor); - - if (status != EFI_SUCCESS) { - efi_printk("Failed to read rom->vendor\n"); - goto free_struct; - } - - status = efi_call_proto(pci, pci.read, EfiPciIoWidthUint16, - PCI_DEVICE_ID, 1, &rom->devid); - - if (status != EFI_SUCCESS) { - efi_printk("Failed to read rom->devid\n"); - goto free_struct; - } - - status = efi_call_proto(pci, get_location, &rom->segment, &rom->bus, - &rom->device, &rom->function); - - if (status != EFI_SUCCESS) - goto free_struct; - - memcpy(rom->romdata, romimage, romsize); - return status; - -free_struct: - efi_bs_call(free_pool, rom); - return status; -} - -/* - * There's no way to return an informative status from this function, - * because any analysis (and printing of error messages) needs to be - * done directly at the EFI function call-site. - * - * For example, EFI_INVALID_PARAMETER could indicate a bug or maybe we - * just didn't find any PCI devices, but there's no way to tell outside - * the context of the call. - */ -static void setup_efi_pci(struct boot_params *params) -{ - efi_status_t status; - void **pci_handle = NULL; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - unsigned long size = 0; - struct setup_data *data; - efi_handle_t h; - int i; - - status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL, - &pci_proto, NULL, &size, pci_handle); - - if (status == EFI_BUFFER_TOO_SMALL) { - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size, - (void **)&pci_handle); - - if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'pci_handle'\n"); - return; - } - - status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL, - &pci_proto, NULL, &size, pci_handle); - } - - if (status != EFI_SUCCESS) - goto free_handle; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - for_each_efi_handle(h, pci_handle, size, i) { - efi_pci_io_protocol_t *pci = NULL; - struct pci_setup_rom *rom; - - status = efi_bs_call(handle_protocol, h, &pci_proto, - (void **)&pci); - if (status != EFI_SUCCESS || !pci) - continue; - - status = preserve_pci_rom_image(pci, &rom); - if (status != EFI_SUCCESS) - continue; - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - } - -free_handle: - efi_bs_call(free_pool, pci_handle); -} - -static void retrieve_apple_device_properties(struct boot_params *boot_params) -{ - efi_guid_t guid = APPLE_PROPERTIES_PROTOCOL_GUID; - struct setup_data *data, *new; - efi_status_t status; - u32 size = 0; - apple_properties_protocol_t *p; - - status = efi_bs_call(locate_protocol, &guid, NULL, (void **)&p); - if (status != EFI_SUCCESS) - return; - - if (efi_table_attr(p, version) != 0x10000) { - efi_printk("Unsupported properties proto version\n"); - return; - } - - efi_call_proto(p, get_all, NULL, &size); - if (!size) - return; - - do { - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, - size + sizeof(struct setup_data), - (void **)&new); - if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'properties'\n"); - return; - } - - status = efi_call_proto(p, get_all, new->data, &size); - - if (status == EFI_BUFFER_TOO_SMALL) - efi_bs_call(free_pool, new); - } while (status == EFI_BUFFER_TOO_SMALL); - - new->type = SETUP_APPLE_PROPERTIES; - new->len = size; - new->next = 0; - - data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; - if (!data) { - boot_params->hdr.setup_data = (unsigned long)new; - } else { - while (data->next) - data = (struct setup_data *)(unsigned long)data->next; - data->next = (unsigned long)new; - } -} - -static const efi_char16_t apple[] = L"Apple"; - -static void setup_quirks(struct boot_params *boot_params) -{ - efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) - efi_table_attr(efi_system_table(), fw_vendor); - - if (!memcmp(fw_vendor, apple, sizeof(apple))) { - if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) - retrieve_apple_device_properties(boot_params); - } -} - -/* - * See if we have Universal Graphics Adapter (UGA) protocol - */ -static efi_status_t -setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size) -{ - efi_status_t status; - u32 width, height; - void **uga_handle = NULL; - efi_uga_draw_protocol_t *uga = NULL, *first_uga; - efi_handle_t handle; - int i; - - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size, - (void **)&uga_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL, - uga_proto, NULL, &size, uga_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - height = 0; - width = 0; - - first_uga = NULL; - for_each_efi_handle(handle, uga_handle, size, i) { - efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; - u32 w, h, depth, refresh; - void *pciio; - - status = efi_bs_call(handle_protocol, handle, uga_proto, - (void **)&uga); - if (status != EFI_SUCCESS) - continue; - - pciio = NULL; - efi_bs_call(handle_protocol, handle, &pciio_proto, &pciio); - - status = efi_call_proto(uga, get_mode, &w, &h, &depth, &refresh); - if (status == EFI_SUCCESS && (!first_uga || pciio)) { - width = w; - height = h; - - /* - * Once we've found a UGA supporting PCIIO, - * don't bother looking any further. - */ - if (pciio) - break; - - first_uga = uga; - } - } - - if (!width && !height) - goto free_handle; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_depth = 32; - si->lfb_width = width; - si->lfb_height = height; - - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - -free_handle: - efi_bs_call(free_pool, uga_handle); - - return status; -} - -void setup_graphics(struct boot_params *boot_params) -{ - efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; - struct screen_info *si; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; - efi_status_t status; - unsigned long size; - void **gop_handle = NULL; - void **uga_handle = NULL; - - si = &boot_params->screen_info; - memset(si, 0, sizeof(*si)); - - size = 0; - status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL, - &graphics_proto, NULL, &size, gop_handle); - if (status == EFI_BUFFER_TOO_SMALL) - status = efi_setup_gop(si, &graphics_proto, size); - - if (status != EFI_SUCCESS) { - size = 0; - status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL, - &uga_proto, NULL, &size, uga_handle); - if (status == EFI_BUFFER_TOO_SMALL) - setup_uga(si, &uga_proto, size); - } -} - -void startup_32(struct boot_params *boot_params); - -void __noreturn efi_stub_entry(efi_handle_t handle, - efi_system_table_t *sys_table_arg, - struct boot_params *boot_params); - -/* - * Because the x86 boot code expects to be passed a boot_params we - * need to create one ourselves (usually the bootloader would create - * one for us). - */ -efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, - efi_system_table_t *sys_table_arg) -{ - struct boot_params *boot_params; - struct apm_bios_info *bi; - struct setup_header *hdr; - efi_loaded_image_t *image; - efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; - int options_size = 0; - efi_status_t status; - char *cmdline_ptr; - unsigned long ramdisk_addr; - unsigned long ramdisk_size; - - sys_table = sys_table_arg; - - /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - return EFI_INVALID_PARAMETER; - - status = efi_bs_call(handle_protocol, handle, &proto, (void *)&image); - if (status != EFI_SUCCESS) { - efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); - return status; - } - - status = efi_low_alloc(0x4000, 1, (unsigned long *)&boot_params); - if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate lowmem for boot params\n"); - return status; - } - - memset(boot_params, 0x0, 0x4000); - - hdr = &boot_params->hdr; - bi = &boot_params->apm_bios_info; - - /* Copy the second sector to boot_params */ - memcpy(&hdr->jump, image->image_base + 512, 512); - - /* - * Fill out some of the header fields ourselves because the - * EFI firmware loader doesn't load the first sector. - */ - hdr->root_flags = 1; - hdr->vid_mode = 0xffff; - hdr->boot_flag = 0xAA55; - - hdr->type_of_loader = 0x21; - - /* Convert unicode cmdline to ascii */ - cmdline_ptr = efi_convert_cmdline(image, &options_size); - if (!cmdline_ptr) - goto fail; - - hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; - /* Fill in upper bits of command line address, NOP on 32 bit */ - boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; - - hdr->ramdisk_image = 0; - hdr->ramdisk_size = 0; - - /* Clear APM BIOS info */ - memset(bi, 0, sizeof(*bi)); - - status = efi_parse_options(cmdline_ptr); - if (status != EFI_SUCCESS) - goto fail2; - - status = handle_cmdline_files(image, - (char *)(unsigned long)hdr->cmd_line_ptr, - "initrd=", hdr->initrd_addr_max, - &ramdisk_addr, &ramdisk_size); - - if (status != EFI_SUCCESS && - hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) { - efi_printk("Trying to load files to higher address\n"); - status = handle_cmdline_files(image, - (char *)(unsigned long)hdr->cmd_line_ptr, - "initrd=", -1UL, - &ramdisk_addr, &ramdisk_size); - } - - if (status != EFI_SUCCESS) - goto fail2; - hdr->ramdisk_image = ramdisk_addr & 0xffffffff; - hdr->ramdisk_size = ramdisk_size & 0xffffffff; - boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32; - boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32; - - hdr->code32_start = (u32)(unsigned long)startup_32; - - efi_stub_entry(handle, sys_table, boot_params); - /* not reached */ - -fail2: - efi_free(options_size, hdr->cmd_line_ptr); -fail: - efi_free(0x4000, (unsigned long)boot_params); - - return status; -} - -static void add_e820ext(struct boot_params *params, - struct setup_data *e820ext, u32 nr_entries) -{ - struct setup_data *data; - - e820ext->type = SETUP_E820_EXT; - e820ext->len = nr_entries * sizeof(struct boot_e820_entry); - e820ext->next = 0; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - if (data) - data->next = (unsigned long)e820ext; - else - params->hdr.setup_data = (unsigned long)e820ext; -} - -static efi_status_t -setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_size) -{ - struct boot_e820_entry *entry = params->e820_table; - struct efi_info *efi = ¶ms->efi_info; - struct boot_e820_entry *prev = NULL; - u32 nr_entries; - u32 nr_desc; - int i; - - nr_entries = 0; - nr_desc = efi->efi_memmap_size / efi->efi_memdesc_size; - - for (i = 0; i < nr_desc; i++) { - efi_memory_desc_t *d; - unsigned int e820_type = 0; - unsigned long m = efi->efi_memmap; - -#ifdef CONFIG_X86_64 - m |= (u64)efi->efi_memmap_hi << 32; -#endif - - d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i); - switch (d->type) { - case EFI_RESERVED_TYPE: - case EFI_RUNTIME_SERVICES_CODE: - case EFI_RUNTIME_SERVICES_DATA: - case EFI_MEMORY_MAPPED_IO: - case EFI_MEMORY_MAPPED_IO_PORT_SPACE: - case EFI_PAL_CODE: - e820_type = E820_TYPE_RESERVED; - break; - - case EFI_UNUSABLE_MEMORY: - e820_type = E820_TYPE_UNUSABLE; - break; - - case EFI_ACPI_RECLAIM_MEMORY: - e820_type = E820_TYPE_ACPI; - break; - - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - if (efi_soft_reserve_enabled() && - (d->attribute & EFI_MEMORY_SP)) - e820_type = E820_TYPE_SOFT_RESERVED; - else - e820_type = E820_TYPE_RAM; - break; - - case EFI_ACPI_MEMORY_NVS: - e820_type = E820_TYPE_NVS; - break; - - case EFI_PERSISTENT_MEMORY: - e820_type = E820_TYPE_PMEM; - break; - - default: - continue; - } - - /* Merge adjacent mappings */ - if (prev && prev->type == e820_type && - (prev->addr + prev->size) == d->phys_addr) { - prev->size += d->num_pages << 12; - continue; - } - - if (nr_entries == ARRAY_SIZE(params->e820_table)) { - u32 need = (nr_desc - i) * sizeof(struct e820_entry) + - sizeof(struct setup_data); - - if (!e820ext || e820ext_size < need) - return EFI_BUFFER_TOO_SMALL; - - /* boot_params map full, switch to e820 extended */ - entry = (struct boot_e820_entry *)e820ext->data; - } - - entry->addr = d->phys_addr; - entry->size = d->num_pages << PAGE_SHIFT; - entry->type = e820_type; - prev = entry++; - nr_entries++; - } - - if (nr_entries > ARRAY_SIZE(params->e820_table)) { - u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_table); - - add_e820ext(params, e820ext, nr_e820ext); - nr_entries -= nr_e820ext; - } - - params->e820_entries = (u8)nr_entries; - - return EFI_SUCCESS; -} - -static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, - u32 *e820ext_size) -{ - efi_status_t status; - unsigned long size; - - size = sizeof(struct setup_data) + - sizeof(struct e820_entry) * nr_desc; - - if (*e820ext) { - efi_bs_call(free_pool, *e820ext); - *e820ext = NULL; - *e820ext_size = 0; - } - - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size, - (void **)e820ext); - if (status == EFI_SUCCESS) - *e820ext_size = size; - - return status; -} - -static efi_status_t allocate_e820(struct boot_params *params, - struct setup_data **e820ext, - u32 *e820ext_size) -{ - unsigned long map_size, desc_size, buff_size; - struct efi_boot_memmap boot_map; - efi_memory_desc_t *map; - efi_status_t status; - __u32 nr_desc; - - boot_map.map = ↦ - boot_map.map_size = &map_size; - boot_map.desc_size = &desc_size; - boot_map.desc_ver = NULL; - boot_map.key_ptr = NULL; - boot_map.buff_size = &buff_size; - - status = efi_get_memory_map(&boot_map); - if (status != EFI_SUCCESS) - return status; - - nr_desc = buff_size / desc_size; - - if (nr_desc > ARRAY_SIZE(params->e820_table)) { - u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table); - - status = alloc_e820ext(nr_e820ext, e820ext, e820ext_size); - if (status != EFI_SUCCESS) - return status; - } - - return EFI_SUCCESS; -} - -struct exit_boot_struct { - struct boot_params *boot_params; - struct efi_info *efi; -}; - -static efi_status_t exit_boot_func(struct efi_boot_memmap *map, - void *priv) -{ - const char *signature; - struct exit_boot_struct *p = priv; - - signature = efi_is_64bit() ? EFI64_LOADER_SIGNATURE - : EFI32_LOADER_SIGNATURE; - memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - - p->efi->efi_systab = (unsigned long)efi_system_table(); - p->efi->efi_memdesc_size = *map->desc_size; - p->efi->efi_memdesc_version = *map->desc_ver; - p->efi->efi_memmap = (unsigned long)*map->map; - p->efi->efi_memmap_size = *map->map_size; - -#ifdef CONFIG_X86_64 - p->efi->efi_systab_hi = (unsigned long)efi_system_table() >> 32; - p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; -#endif - - return EFI_SUCCESS; -} - -static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) -{ - unsigned long map_sz, key, desc_size, buff_size; - efi_memory_desc_t *mem_map; - struct setup_data *e820ext = NULL; - __u32 e820ext_size = 0; - efi_status_t status; - __u32 desc_version; - struct efi_boot_memmap map; - struct exit_boot_struct priv; - - map.map = &mem_map; - map.map_size = &map_sz; - map.desc_size = &desc_size; - map.desc_ver = &desc_version; - map.key_ptr = &key; - map.buff_size = &buff_size; - priv.boot_params = boot_params; - priv.efi = &boot_params->efi_info; - - status = allocate_e820(boot_params, &e820ext, &e820ext_size); - if (status != EFI_SUCCESS) - return status; - - /* Might as well exit boot services now */ - status = efi_exit_boot_services(handle, &map, &priv, exit_boot_func); - if (status != EFI_SUCCESS) - return status; - - /* Historic? */ - boot_params->alt_mem_k = 32 * 1024; - - status = setup_e820(boot_params, e820ext, e820ext_size); - if (status != EFI_SUCCESS) - return status; - - return EFI_SUCCESS; -} - -/* - * On success we return a pointer to a boot_params structure, and NULL - * on failure. - */ -struct boot_params *efi_main(efi_handle_t handle, - efi_system_table_t *sys_table_arg, - struct boot_params *boot_params) -{ - struct desc_ptr *gdt = NULL; - struct setup_header *hdr = &boot_params->hdr; - efi_status_t status; - struct desc_struct *desc; - unsigned long cmdline_paddr; - - sys_table = sys_table_arg; - - /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - goto fail; - - /* - * make_boot_params() may have been called before efi_main(), in which - * case this is the second time we parse the cmdline. This is ok, - * parsing the cmdline multiple times does not have side-effects. - */ - cmdline_paddr = ((u64)hdr->cmd_line_ptr | - ((u64)boot_params->ext_cmd_line_ptr << 32)); - efi_parse_options((char *)cmdline_paddr); - - /* - * If the boot loader gave us a value for secure_boot then we use that, - * otherwise we ask the BIOS. - */ - if (boot_params->secure_boot == efi_secureboot_mode_unset) - boot_params->secure_boot = efi_get_secureboot(); - - /* Ask the firmware to clear memory on unclean shutdown */ - efi_enable_reset_attack_mitigation(); - - efi_random_get_seed(); - - efi_retrieve_tpm2_eventlog(); - - setup_graphics(boot_params); - - setup_efi_pci(boot_params); - - setup_quirks(boot_params); - - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), - (void **)&gdt); - if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'gdt' structure\n"); - goto fail; - } - - gdt->size = 0x800; - status = efi_low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); - if (status != EFI_SUCCESS) { - efi_printk("Failed to allocate memory for 'gdt'\n"); - goto fail; - } - - /* - * If the kernel isn't already loaded at the preferred load - * address, relocate it. - */ - if (hdr->pref_address != hdr->code32_start) { - unsigned long bzimage_addr = hdr->code32_start; - status = efi_relocate_kernel(&bzimage_addr, - hdr->init_size, hdr->init_size, - hdr->pref_address, - hdr->kernel_alignment, - LOAD_PHYSICAL_ADDR); - if (status != EFI_SUCCESS) { - efi_printk("efi_relocate_kernel() failed!\n"); - goto fail; - } - - hdr->pref_address = hdr->code32_start; - hdr->code32_start = bzimage_addr; - } - - status = exit_boot(boot_params, handle); - if (status != EFI_SUCCESS) { - efi_printk("exit_boot() failed!\n"); - goto fail; - } - - memset((char *)gdt->address, 0x0, gdt->size); - desc = (struct desc_struct *)gdt->address; - - /* The first GDT is a dummy. */ - desc++; - - if (IS_ENABLED(CONFIG_X86_64)) { - /* __KERNEL32_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; - - desc++; - } else { - /* Second entry is unused on 32-bit */ - desc++; - } - - /* __KERNEL_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - - if (IS_ENABLED(CONFIG_X86_64)) { - desc->l = 1; - desc->d = 0; - } else { - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - } - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; - desc++; - - /* __KERNEL_DS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; - desc++; - - if (IS_ENABLED(CONFIG_X86_64)) { - /* Task segment value */ - desc->limit0 = 0x0000; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_TSS; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0x0; - desc->avl = 0; - desc->l = 0; - desc->d = 0; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; - desc++; - } - - asm volatile("cli"); - asm volatile ("lgdt %0" : : "m" (*gdt)); - - return boot_params; -fail: - efi_printk("efi_main() failed!\n"); - - for (;;) - asm("hlt"); -} diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h deleted file mode 100644 index 99f35343d443..000000000000 --- a/arch/x86/boot/compressed/eboot.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef BOOT_COMPRESSED_EBOOT_H -#define BOOT_COMPRESSED_EBOOT_H - -#define SEG_TYPE_DATA (0 << 3) -#define SEG_TYPE_READ_WRITE (1 << 1) -#define SEG_TYPE_CODE (1 << 3) -#define SEG_TYPE_EXEC_READ (1 << 1) -#define SEG_TYPE_TSS ((1 << 3) | (1 << 0)) -#define SEG_OP_SIZE_32BIT (1 << 0) -#define SEG_GRANULARITY_4KB (1 << 0) - -#define DESC_TYPE_CODE_DATA (1 << 0) - -typedef union efi_uga_draw_protocol efi_uga_draw_protocol_t; - -union efi_uga_draw_protocol { - struct { - efi_status_t (__efiapi *get_mode)(efi_uga_draw_protocol_t *, - u32*, u32*, u32*, u32*); - void *set_mode; - void *blt; - }; - struct { - u32 get_mode; - u32 set_mode; - u32 blt; - } mixed_mode; -}; - -#endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index 8fb7f6799c52..2b2049259619 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -54,11 +54,16 @@ SYM_FUNC_START(__efi64_thunk) * Switch to gdt with 32-bit segments. This is the firmware GDT * that was installed when the kernel started executing. This * pointer was saved at the EFI stub entry point in head_64.S. + * + * Pass the saved DS selector to the 32-bit code, and use far return to + * restore the saved CS selector. */ leaq efi32_boot_gdt(%rip), %rax lgdt (%rax) - pushq $__KERNEL_CS + movzwl efi32_boot_ds(%rip), %edx + movzwq efi32_boot_cs(%rip), %rax + pushq %rax leaq efi_enter32(%rip), %rax pushq %rax lretq @@ -73,6 +78,10 @@ SYM_FUNC_START(__efi64_thunk) movl %ebx, %es pop %rbx movl %ebx, %ds + /* Clear out 32-bit selector from FS and GS */ + xorl %ebx, %ebx + movl %ebx, %fs + movl %ebx, %gs /* * Convert 32-bit status code into 64-bit. @@ -92,10 +101,12 @@ SYM_FUNC_END(__efi64_thunk) * The stack should represent the 32-bit calling convention. */ SYM_FUNC_START_LOCAL(efi_enter32) - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss + /* Load firmware selector into data and stack segment registers */ + movl %edx, %ds + movl %edx, %es + movl %edx, %fs + movl %edx, %gs + movl %edx, %ss /* Reload pgtables */ movl %cr3, %eax @@ -157,6 +168,14 @@ SYM_DATA_START(efi32_boot_gdt) .quad 0 SYM_DATA_END(efi32_boot_gdt) +SYM_DATA_START(efi32_boot_cs) + .word 0 +SYM_DATA_END(efi32_boot_cs) + +SYM_DATA_START(efi32_boot_ds) + .word 0 +SYM_DATA_END(efi32_boot_ds) + SYM_DATA_START(efi_gdt64) .word efi_gdt64_end - efi_gdt64 .long 0 /* Filled out by user */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 73f17d0544dd..ab3307036ba4 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -63,21 +63,7 @@ __HEAD SYM_FUNC_START(startup_32) cld - /* - * Test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments - */ - testb $KEEP_SEGMENTS, BP_loadflags(%esi) - jnz 1f - cli - movl $__BOOT_DS, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %fs - movl %eax, %gs - movl %eax, %ss -1: /* * Calculate the delta between where we were compiled to run @@ -89,32 +75,59 @@ SYM_FUNC_START(startup_32) */ leal (BP_scratch+4)(%esi), %esp call 1f -1: popl %ebp - subl $1b, %ebp +1: popl %edx + subl $1b, %edx + + /* Load new GDT */ + leal gdt(%edx), %eax + movl %eax, 2(%eax) + lgdt (%eax) + + /* Load segment registers with our descriptors */ + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss /* - * %ebp contains the address we are loaded at by the boot loader and %ebx + * %edx contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily - * for safe in-place decompression. + * for safe in-place decompression. %ebp contains the address that the kernel + * will be decompressed to. */ #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx + movl %edx, %ebx + +#ifdef CONFIG_EFI_STUB +/* + * If we were loaded via the EFI LoadImage service, startup_32() will be at an + * offset to the start of the space allocated for the image. efi_pe_entry() will + * set up image_offset to tell us where the image actually starts, so that we + * can use the full available buffer. + * image_offset = startup_32 - image_base + * Otherwise image_offset will be zero and has no effect on the calculations. + */ + subl image_offset(%edx), %ebx +#endif + movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx notl %eax andl %eax, %ebx cmpl $LOAD_PHYSICAL_ADDR, %ebx - jge 1f + jae 1f #endif movl $LOAD_PHYSICAL_ADDR, %ebx 1: + movl %ebx, %ebp // Save the output address for later /* Target address to relocate to for decompression */ - movl BP_init_size(%esi), %eax - subl $_end, %eax - addl %eax, %ebx + addl BP_init_size(%esi), %ebx + subl $_end, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -128,7 +141,7 @@ SYM_FUNC_START(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal (_bss-4)(%ebp), %esi + leal (_bss-4)(%edx), %esi leal (_bss-4)(%ebx), %edi movl $(_bss - startup_32), %ecx shrl $2, %ecx @@ -137,6 +150,15 @@ SYM_FUNC_START(startup_32) cld popl %esi + /* + * The GDT may get overwritten either during the copy we just did or + * during extract_kernel below. To avoid any issues, repoint the GDTR + * to the new copy of the GDT. + */ + leal gdt(%ebx), %eax + movl %eax, 2(%eax) + lgdt (%eax) + /* * Jump to the relocated address. */ @@ -148,9 +170,8 @@ SYM_FUNC_END(startup_32) SYM_FUNC_START(efi32_stub_entry) SYM_FUNC_START_ALIAS(efi_stub_entry) add $0x4, %esp + movl 8(%esp), %esi /* save boot_params pointer */ call efi_main - movl %eax, %esi - movl BP_code32_start(%esi), %eax leal startup_32(%eax), %eax jmp *%eax SYM_FUNC_END(efi32_stub_entry) @@ -189,9 +210,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) /* push arguments for extract_kernel: */ pushl $z_output_len /* decompressed length, end of relocs */ - leal _end(%ebx), %eax - subl BP_init_size(%esi), %eax - pushl %eax /* output address */ + pushl %ebp /* output address */ pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax @@ -209,6 +228,21 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) jmp *%eax SYM_FUNC_END(.Lrelocated) + .data + .balign 8 +SYM_DATA_START_LOCAL(gdt) + .word gdt_end - gdt - 1 + .long 0 + .word 0 + .quad 0x0000000000000000 /* Reserved */ + .quad 0x00cf9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) + +#ifdef CONFIG_EFI_STUB +SYM_DATA(image_offset, .long 0) +#endif + /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 1f1f6c8139b3..4f7e6b84be07 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -53,19 +53,7 @@ SYM_FUNC_START(startup_32) * all need to be under the 4G limit. */ cld - /* - * Test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments - */ - testb $KEEP_SEGMENTS, BP_loadflags(%esi) - jnz 1f - cli - movl $(__BOOT_DS), %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss -1: /* * Calculate the delta between where we were compiled to run @@ -80,10 +68,21 @@ SYM_FUNC_START(startup_32) 1: popl %ebp subl $1b, %ebp + /* Load new GDT with the 64bit segments using 32bit descriptor */ + leal gdt(%ebp), %eax + movl %eax, 2(%eax) + lgdt (%eax) + + /* Load segment registers with our descriptors */ + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss + /* setup a stack and make sure cpu supports long mode. */ - movl $boot_stack_end, %eax - addl %ebp, %eax - movl %eax, %esp + leal boot_stack_end(%ebp), %esp call verify_cpu testl %eax, %eax @@ -100,30 +99,38 @@ SYM_FUNC_START(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx + +#ifdef CONFIG_EFI_STUB +/* + * If we were loaded via the EFI LoadImage service, startup_32 will be at an + * offset to the start of the space allocated for the image. efi_pe_entry will + * set up image_offset to tell us where the image actually starts, so that we + * can use the full available buffer. + * image_offset = startup_32 - image_base + * Otherwise image_offset will be zero and has no effect on the calculations. + */ + subl image_offset(%ebp), %ebx +#endif + movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx notl %eax andl %eax, %ebx cmpl $LOAD_PHYSICAL_ADDR, %ebx - jge 1f + jae 1f #endif movl $LOAD_PHYSICAL_ADDR, %ebx 1: /* Target address to relocate to for decompression */ - movl BP_init_size(%esi), %eax - subl $_end, %eax - addl %eax, %ebx + addl BP_init_size(%esi), %ebx + subl $_end, %ebx /* * Prepare for entering 64 bit mode */ - /* Load new GDT with the 64bit segments using 32bit descriptor */ - addl %ebp, gdt+2(%ebp) - lgdt gdt(%ebp) - /* Enable PAE mode */ movl %cr4, %eax orl $X86_CR4_PAE, %eax @@ -212,8 +219,13 @@ SYM_FUNC_START(startup_32) cmp $0, %edi jz 1f leal efi64_stub_entry(%ebp), %eax - movl %esi, %edx movl efi32_boot_args+4(%ebp), %esi + movl efi32_boot_args+8(%ebp), %edx // saved bootparams pointer + cmpl $0, %edx + jnz 1f + leal efi_pe_entry(%ebp), %eax + movl %edi, %ecx // MS calling convention + movl %esi, %edx 1: #endif pushl %eax @@ -238,11 +250,17 @@ SYM_FUNC_START(efi32_stub_entry) 1: pop %ebp subl $1b, %ebp + movl %esi, efi32_boot_args+8(%ebp) +SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL) movl %ecx, efi32_boot_args(%ebp) movl %edx, efi32_boot_args+4(%ebp) - sgdtl efi32_boot_gdt(%ebp) movb $0, efi_is64(%ebp) + /* Save firmware GDTR and code/data selectors */ + sgdtl efi32_boot_gdt(%ebp) + movw %cs, efi32_boot_cs(%ebp) + movw %ds, efi32_boot_ds(%ebp) + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax @@ -266,6 +284,9 @@ SYM_CODE_START(startup_64) * and command line. */ + cld + cli + /* Setup data segments. */ xorl %eax, %eax movl %eax, %ds @@ -290,13 +311,27 @@ SYM_CODE_START(startup_64) /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp + +#ifdef CONFIG_EFI_STUB +/* + * If we were loaded via the EFI LoadImage service, startup_32 will be at an + * offset to the start of the space allocated for the image. efi_pe_entry will + * set up image_offset to tell us where the image actually starts, so that we + * can use the full available buffer. + * image_offset = startup_32 - image_base + * Otherwise image_offset will be zero and has no effect on the calculations. + */ + movl image_offset(%rip), %eax + subq %rax, %rbp +#endif + movl BP_kernel_alignment(%rsi), %eax decl %eax addq %rax, %rbp notq %rax andq %rax, %rbp cmpq $LOAD_PHYSICAL_ADDR, %rbp - jge 1f + jae 1f #endif movq $LOAD_PHYSICAL_ADDR, %rbp 1: @@ -354,9 +389,9 @@ SYM_CODE_START(startup_64) */ /* Make sure we have GDT with 32-bit code segment */ - leaq gdt(%rip), %rax - movq %rax, gdt64+2(%rip) - lgdt gdt64(%rip) + leaq gdt64(%rip), %rax + addq %rax, 2(%rax) + lgdt (%rax) /* * paging_prepare() sets up the trampoline and checks if we need to @@ -441,6 +476,16 @@ trampoline_return: cld popq %rsi + /* + * The GDT may get overwritten either during the copy we just did or + * during extract_kernel below. To avoid any issues, repoint the GDTR + * to the new copy of the GDT. + */ + leaq gdt64(%rbx), %rax + leaq gdt(%rbx), %rdx + movq %rdx, 2(%rax) + lgdt (%rax) + /* * Jump to the relocated address. */ @@ -453,9 +498,9 @@ SYM_CODE_END(startup_64) SYM_FUNC_START(efi64_stub_entry) SYM_FUNC_START_ALIAS(efi_stub_entry) and $~0xf, %rsp /* realign the stack */ + movq %rdx, %rbx /* save boot_params pointer */ call efi_main - movq %rax,%rsi - movl BP_code32_start(%esi), %eax + movq %rbx,%rsi leaq startup_64(%rax), %rax jmp *%rax SYM_FUNC_END(efi64_stub_entry) @@ -484,7 +529,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ - movq $z_output_len, %r9 /* decompressed length, end of relocs */ + movl $z_output_len, %r9d /* decompressed length, end of relocs */ call extract_kernel /* returns kernel location in %rax */ popq %rsi @@ -613,13 +658,13 @@ SYM_FUNC_END(.Lno_longmode) .data SYM_DATA_START_LOCAL(gdt64) - .word gdt_end - gdt - .quad 0 + .word gdt_end - gdt - 1 + .quad gdt - gdt64 SYM_DATA_END(gdt64) .balign 8 SYM_DATA_START_LOCAL(gdt) - .word gdt_end - gdt - .long gdt + .word gdt_end - gdt - 1 + .long 0 .word 0 .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0x00af9a000000ffff /* __KERNEL_CS */ @@ -628,9 +673,97 @@ SYM_DATA_START_LOCAL(gdt) .quad 0x0000000000000000 /* TS continued */ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) +#ifdef CONFIG_EFI_STUB +SYM_DATA(image_offset, .long 0) +#endif + #ifdef CONFIG_EFI_MIXED -SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0) +SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) SYM_DATA(efi_is64, .byte 1) + +#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) +#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) +#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) + + .text + .code32 +SYM_FUNC_START(efi32_pe_entry) +/* + * efi_status_t efi32_pe_entry(efi_handle_t image_handle, + * efi_system_table_32_t *sys_table) + */ + + pushl %ebp + movl %esp, %ebp + pushl %eax // dummy push to allocate loaded_image + + pushl %ebx // save callee-save registers + pushl %edi + + call verify_cpu // check for long mode support + testl %eax, %eax + movl $0x80000003, %eax // EFI_UNSUPPORTED + jnz 2f + + call 1f +1: pop %ebx + subl $1b, %ebx + + /* Get the loaded image protocol pointer from the image handle */ + leal -4(%ebp), %eax + pushl %eax // &loaded_image + leal loaded_image_proto(%ebx), %eax + pushl %eax // pass the GUID address + pushl 8(%ebp) // pass the image handle + + /* + * Note the alignment of the stack frame. + * sys_table + * handle <-- 16-byte aligned on entry by ABI + * return address + * frame pointer + * loaded_image <-- local variable + * saved %ebx <-- 16-byte aligned here + * saved %edi + * &loaded_image + * &loaded_image_proto + * handle <-- 16-byte aligned for call to handle_protocol + */ + + movl 12(%ebp), %eax // sys_table + movl ST32_boottime(%eax), %eax // sys_table->boottime + call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol + addl $12, %esp // restore argument space + testl %eax, %eax + jnz 2f + + movl 8(%ebp), %ecx // image_handle + movl 12(%ebp), %edx // sys_table + movl -4(%ebp), %esi // loaded_image + movl LI32_image_base(%esi), %esi // loaded_image->image_base + movl %ebx, %ebp // startup_32 for efi32_pe_stub_entry + /* + * We need to set the image_offset variable here since startup_32() will + * use it before we get to the 64-bit efi_pe_entry() in C code. + */ + subl %esi, %ebx + movl %ebx, image_offset(%ebp) // save image_offset + jmp efi32_pe_stub_entry + +2: popl %edi // restore callee-save registers + popl %ebx + leave + ret +SYM_FUNC_END(efi32_pe_entry) + + .section ".rodata" + /* EFI loaded image protocol GUID */ + .balign 4 +SYM_DATA_START_LOCAL(loaded_image_proto) + .long 0x5b1b31a1 + .word 0x9562, 0x11d2 + .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b +SYM_DATA_END(loaded_image_proto) #endif /* @@ -647,7 +780,7 @@ SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end) /* * Space for page tables (not in .bss so not zeroed) */ - .section ".pgtable","a",@nobits + .section ".pgtable","aw",@nobits .balign 4096 SYM_DATA_LOCAL(pgtable, .fill BOOT_PGT_SIZE, 1, 0) diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c index 748456c365f4..9557c5a15b91 100644 --- a/arch/x86/boot/compressed/kaslr_64.c +++ b/arch/x86/boot/compressed/kaslr_64.c @@ -29,9 +29,6 @@ #define __PAGE_OFFSET __PAGE_OFFSET_BASE #include "../../mm/ident_map.c" -/* Used by pgtable.h asm code to force instruction serialization. */ -unsigned long __force_order; - /* Used to track our page table allocation area. */ struct alloc_pgt_data { unsigned char *pgt_buf; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index c8181392f70d..726e264410ff 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -59,7 +59,7 @@ void __puthex(unsigned long value); static inline void debug_putstr(const char *s) { } -static inline void debug_puthex(const char *s) +static inline void debug_puthex(unsigned long value) { } #define debug_putaddr(x) /* */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 97d9b6d6c1af..735ad7f21ab0 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -15,7 +15,7 @@ * hex while segment addresses are written as segment:offset. * */ - +#include <linux/pe.h> #include <asm/segment.h> #include <asm/boot.h> #include <asm/page_types.h> @@ -43,8 +43,7 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ bootsect_start: #ifdef CONFIG_EFI_STUB # "MZ", MS-DOS header - .byte 0x4d - .byte 0x5a + .word MZ_MAGIC #endif # Normalize the start address @@ -97,39 +96,30 @@ bugger_off_msg: #ifdef CONFIG_EFI_STUB pe_header: - .ascii "PE" - .word 0 + .long PE_MAGIC coff_header: #ifdef CONFIG_X86_32 - .word 0x14c # i386 + .set image_file_add_flags, IMAGE_FILE_32BIT_MACHINE + .set pe_opt_magic, PE_OPT_MAGIC_PE32 + .word IMAGE_FILE_MACHINE_I386 #else - .word 0x8664 # x86-64 + .set image_file_add_flags, 0 + .set pe_opt_magic, PE_OPT_MAGIC_PE32PLUS + .word IMAGE_FILE_MACHINE_AMD64 #endif - .word 4 # nr_sections + .word section_count # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols .word section_table - optional_header # SizeOfOptionalHeader -#ifdef CONFIG_X86_32 - .word 0x306 # Characteristics. - # IMAGE_FILE_32BIT_MACHINE | - # IMAGE_FILE_DEBUG_STRIPPED | - # IMAGE_FILE_EXECUTABLE_IMAGE | - # IMAGE_FILE_LINE_NUMS_STRIPPED -#else - .word 0x206 # Characteristics - # IMAGE_FILE_DEBUG_STRIPPED | - # IMAGE_FILE_EXECUTABLE_IMAGE | - # IMAGE_FILE_LINE_NUMS_STRIPPED -#endif + .word IMAGE_FILE_EXECUTABLE_IMAGE | \ + image_file_add_flags | \ + IMAGE_FILE_DEBUG_STRIPPED | \ + IMAGE_FILE_LINE_NUMS_STRIPPED # Characteristics optional_header: -#ifdef CONFIG_X86_32 - .word 0x10b # PE32 format -#else - .word 0x20b # PE32+ format -#endif + .word pe_opt_magic .byte 0x02 # MajorLinkerVersion .byte 0x14 # MinorLinkerVersion @@ -148,17 +138,19 @@ optional_header: #endif extra_header_fields: + # PE specification requires ImageBase to be 64k aligned + .set image_base, (LOAD_PHYSICAL_ADDR + 0xffff) & ~0xffff #ifdef CONFIG_X86_32 - .long 0 # ImageBase + .long image_base # ImageBase #else - .quad 0 # ImageBase + .quad image_base # ImageBase #endif .long 0x20 # SectionAlignment .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion - .word 0 # MajorImageVersion - .word 0 # MinorImageVersion + .word LINUX_EFISTUB_MAJOR_VERSION # MajorImageVersion + .word LINUX_EFISTUB_MINOR_VERSION # MinorImageVersion .word 0 # MajorSubsystemVersion .word 0 # MinorSubsystemVersion .long 0 # Win32VersionValue @@ -170,7 +162,7 @@ extra_header_fields: .long 0x200 # SizeOfHeaders .long 0 # CheckSum - .word 0xa # Subsystem (EFI application) + .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) .word 0 # DllCharacteristics #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve @@ -184,7 +176,7 @@ extra_header_fields: .quad 0 # SizeOfHeapCommit #endif .long 0 # LoaderFlags - .long 0x6 # NumberOfRvaAndSizes + .long (section_table - .) / 8 # NumberOfRvaAndSizes .quad 0 # ExportTable .quad 0 # ImportTable @@ -210,7 +202,10 @@ section_table: .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers - .long 0x60500020 # Characteristics (section flags) + .long IMAGE_SCN_CNT_CODE | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_EXECUTE | \ + IMAGE_SCN_ALIGN_16BYTES # Characteristics # # The EFI application loader requires a relocation section @@ -228,45 +223,53 @@ section_table: .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers - .long 0x42100040 # Characteristics (section flags) + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_DISCARDABLE | \ + IMAGE_SCN_ALIGN_1BYTES # Characteristics +#ifdef CONFIG_EFI_MIXED # # The offset & size fields are filled in by build.c. # - .ascii ".text" - .byte 0 - .byte 0 - .byte 0 + .asciz ".compat" .long 0 - .long 0x0 # startup_{32,64} + .long 0x0 .long 0 # Size of initialized data # on disk - .long 0x0 # startup_{32,64} + .long 0x0 .long 0 # PointerToRelocations .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers - .long 0x60500020 # Characteristics (section flags) + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_DISCARDABLE | \ + IMAGE_SCN_ALIGN_1BYTES # Characteristics +#endif # # The offset & size fields are filled in by build.c. # - .ascii ".bss" - .byte 0 + .ascii ".text" .byte 0 .byte 0 .byte 0 .long 0 - .long 0x0 + .long 0x0 # startup_{32,64} .long 0 # Size of initialized data # on disk - .long 0x0 + .long 0x0 # startup_{32,64} .long 0 # PointerToRelocations .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers - .long 0xc8000080 # Characteristics (section flags) + .long IMAGE_SCN_CNT_CODE | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_EXECUTE | \ + IMAGE_SCN_ALIGN_16BYTES # Characteristics + .set section_count, (. - section_table) / 40 #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 3da1c37c6dd5..24c95522f231 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -52,7 +52,6 @@ SECTIONS _end = .; /DISCARD/ : { - *(.eh_frame) *(.note*) } diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 55e669d29e54..8f8c8e386cea 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -53,11 +53,20 @@ u8 buf[SETUP_SECT_MAX*512]; #define PECOFF_RELOC_RESERVE 0x20 +#ifdef CONFIG_EFI_MIXED +#define PECOFF_COMPAT_RESERVE 0x20 +#else +#define PECOFF_COMPAT_RESERVE 0x0 +#endif + unsigned long efi32_stub_entry; unsigned long efi64_stub_entry; unsigned long efi_pe_entry; +unsigned long efi32_pe_entry; unsigned long kernel_info; unsigned long startup_64; +unsigned long _ehead; +unsigned long _end; /*----------------------------------------------------------------------*/ @@ -189,7 +198,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz static void update_pecoff_setup_and_reloc(unsigned int size) { u32 setup_offset = 0x200; - u32 reloc_offset = size - PECOFF_RELOC_RESERVE; + u32 reloc_offset = size - PECOFF_RELOC_RESERVE - PECOFF_COMPAT_RESERVE; +#ifdef CONFIG_EFI_MIXED + u32 compat_offset = reloc_offset + PECOFF_RELOC_RESERVE; +#endif u32 setup_size = reloc_offset - setup_offset; update_pecoff_section_header(".setup", setup_offset, setup_size); @@ -201,43 +213,59 @@ static void update_pecoff_setup_and_reloc(unsigned int size) */ put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]); put_unaligned_le32(10, &buf[reloc_offset + 4]); + +#ifdef CONFIG_EFI_MIXED + update_pecoff_section_header(".compat", compat_offset, PECOFF_COMPAT_RESERVE); + + /* + * Put the IA-32 machine type (0x14c) and the associated entry point + * address in the .compat section, so loaders can figure out which other + * execution modes this image supports. + */ + buf[compat_offset] = 0x1; + buf[compat_offset + 1] = 0x8; + put_unaligned_le16(0x14c, &buf[compat_offset + 2]); + put_unaligned_le32(efi32_pe_entry + size, &buf[compat_offset + 4]); +#endif } -static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) +static void update_pecoff_text(unsigned int text_start, unsigned int file_sz, + unsigned int init_sz) { unsigned int pe_header; unsigned int text_sz = file_sz - text_start; + unsigned int bss_sz = init_sz - file_sz; pe_header = get_unaligned_le32(&buf[0x3c]); /* + * The PE/COFF loader may load the image at an address which is + * misaligned with respect to the kernel_alignment field in the setup + * header. + * + * In order to avoid relocating the kernel to correct the misalignment, + * add slack to allow the buffer to be aligned within the declared size + * of the image. + */ + bss_sz += CONFIG_PHYSICAL_ALIGN; + init_sz += CONFIG_PHYSICAL_ALIGN; + + /* * Size of code: Subtract the size of the first sector (512 bytes) * which includes the header. */ - put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]); + put_unaligned_le32(file_sz - 512 + bss_sz, &buf[pe_header + 0x1c]); + + /* Size of image */ + put_unaligned_le32(init_sz, &buf[pe_header + 0x50]); /* * Address of entry point for PE/COFF executable */ put_unaligned_le32(text_start + efi_pe_entry, &buf[pe_header + 0x28]); - update_pecoff_section_header(".text", text_start, text_sz); -} - -static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz) -{ - unsigned int pe_header; - unsigned int bss_sz = init_sz - file_sz; - - pe_header = get_unaligned_le32(&buf[0x3c]); - - /* Size of uninitialized data */ - put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]); - - /* Size of image */ - put_unaligned_le32(init_sz, &buf[pe_header + 0x50]); - - update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0); + update_pecoff_section_header_fields(".text", text_start, text_sz + bss_sz, + text_sz, text_start); } static int reserve_pecoff_reloc_section(int c) @@ -278,9 +306,8 @@ static void efi_stub_entry_update(void) static inline void update_pecoff_setup_and_reloc(unsigned int size) {} static inline void update_pecoff_text(unsigned int text_start, - unsigned int file_sz) {} -static inline void update_pecoff_bss(unsigned int file_sz, - unsigned int init_sz) {} + unsigned int file_sz, + unsigned int init_sz) {} static inline void efi_stub_defaults(void) {} static inline void efi_stub_entry_update(void) {} @@ -290,6 +317,12 @@ static inline int reserve_pecoff_reloc_section(int c) } #endif /* CONFIG_EFI_STUB */ +static int reserve_pecoff_compat_section(int c) +{ + /* Reserve 0x20 bytes for .compat section */ + memset(buf+c, 0, PECOFF_COMPAT_RESERVE); + return PECOFF_COMPAT_RESERVE; +} /* * Parse zoffset.h and find the entry points. We could just #include zoffset.h @@ -322,8 +355,11 @@ static void parse_zoffset(char *fname) PARSE_ZOFS(p, efi32_stub_entry); PARSE_ZOFS(p, efi64_stub_entry); PARSE_ZOFS(p, efi_pe_entry); + PARSE_ZOFS(p, efi32_pe_entry); PARSE_ZOFS(p, kernel_info); PARSE_ZOFS(p, startup_64); + PARSE_ZOFS(p, _ehead); + PARSE_ZOFS(p, _end); p = strchr(p, '\n'); while (p && (*p == '\r' || *p == '\n')) @@ -365,6 +401,7 @@ int main(int argc, char ** argv) die("Boot block hasn't got boot flag (0xAA55)"); fclose(file); + c += reserve_pecoff_compat_section(c); c += reserve_pecoff_reloc_section(c); /* Pad unused space with zeros */ @@ -406,9 +443,28 @@ int main(int argc, char ** argv) buf[0x1f1] = setup_sectors-1; put_unaligned_le32(sys_size, &buf[0x1f4]); - update_pecoff_text(setup_sectors * 512, i + (sys_size * 16)); init_sz = get_unaligned_le32(&buf[0x260]); - update_pecoff_bss(i + (sys_size * 16), init_sz); +#ifdef CONFIG_EFI_STUB + /* + * The decompression buffer will start at ImageBase. When relocating + * the compressed kernel to its end, we must ensure that the head + * section does not get overwritten. The head section occupies + * [i, i + _ehead), and the destination is [init_sz - _end, init_sz). + * + * At present these should never overlap, because 'i' is at most 32k + * because of SETUP_SECT_MAX, '_ehead' is less than 1k, and the + * calculation of INIT_SIZE in boot/header.S ensures that + * 'init_sz - _end' is at least 64k. + * + * For future-proofing, increase init_sz if necessary. + */ + + if (init_sz - _end < i + _ehead) { + init_sz = (i + _ehead + _end + 4095) & ~4095; + put_unaligned_le32(init_sz, &buf[0x260]); + } +#endif + update_pecoff_text(setup_sectors * 512, i + (sys_size * 16), init_sz); efi_stub_entry_update(); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 59ce9ed58430..5b602beb0b72 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -125,7 +125,6 @@ CONFIG_IP6_NF_MANGLE=y CONFIG_NET_SCHED=y CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y -CONFIG_HAMRADIO=y CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y @@ -171,7 +170,6 @@ CONFIG_FORCEDETH=y CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set CONFIG_R8169=y -CONFIG_FDDI=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 0b9654c7a05c..f3d1f36103b1 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -123,7 +123,6 @@ CONFIG_IP6_NF_MANGLE=y CONFIG_NET_SCHED=y CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y -CONFIG_HAMRADIO=y CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y @@ -164,7 +163,6 @@ CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y CONFIG_R8169=y -CONFIG_FDDI=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index b69e00bf20b8..8c2e9eadee8a 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -11,6 +11,7 @@ avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no) sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) +adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no) obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -39,7 +40,11 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o -obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o + +# These modules require the assembler to support ADX. +ifeq ($(adx_supported),yes) + obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o +endif # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index bbbebbd35b5d..75b6ea20491e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1064,7 +1064,7 @@ static struct aead_alg aesni_aeads[0]; static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 418bd88acac8..7c4c7b2fbf05 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -170,7 +170,7 @@ static struct shash_alg alg = { }; static const struct x86_cpu_id crc32pclmul_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index c20d1b8a82c3..d2d069bd459b 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -221,7 +221,7 @@ static struct shash_alg alg = { }; static const struct x86_cpu_id crc32c_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM4_2), + X86_MATCH_FEATURE(X86_FEATURE_XMM4_2, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id); diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 3c81e15b0873..71291d5af9f4 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -114,7 +114,7 @@ static struct shash_alg alg = { }; static const struct x86_cpu_id crct10dif_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index a4b728518e28..1f1a95f3dd0c 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -313,7 +313,7 @@ static struct ahash_alg ghash_async_alg = { }; static const struct x86_cpu_id pcmul_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */ + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */ {} }; MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 06fc70cf5433..85eb381259c2 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -14,4 +14,5 @@ obj-y += vdso/ obj-y += vsyscall/ obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o +obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 9747876980b5..76735ec813e6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -34,6 +34,7 @@ #include <asm/fpu/api.h> #include <asm/nospec-branch.h> #include <asm/io_bitmap.h> +#include <asm/syscall.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> @@ -333,20 +334,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) if (likely(nr < IA32_NR_syscalls)) { nr = array_index_nospec(nr, IA32_NR_syscalls); -#ifdef CONFIG_IA32_EMULATION regs->ax = ia32_sys_call_table[nr](regs); -#else - /* - * It's possible that a 32-bit syscall implementation - * takes a 64-bit parameter but nonetheless assumes that - * the high bits are zero. Make sure we zero-extend all - * of the args. - */ - regs->ax = ia32_sys_call_table[nr]( - (unsigned int)regs->bx, (unsigned int)regs->cx, - (unsigned int)regs->dx, (unsigned int)regs->si, - (unsigned int)regs->di, (unsigned int)regs->bp); -#endif /* CONFIG_IA32_EMULATION */ } syscall_return_slowpath(regs); @@ -438,3 +426,8 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) #endif } #endif + +SYSCALL_DEFINE0(ni_syscall) +{ + return -ENOSYS; +} diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7e0560442538..b67bae7091d7 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1088,10 +1088,10 @@ SYM_FUNC_START(entry_INT80_32) STACKLEAK_ERASE restore_all: - TRACE_IRQS_IRET + TRACE_IRQS_ON SWITCH_TO_ENTRY_STACK CHECK_AND_APPLY_ESPFIX -.Lrestore_nocheck: + /* Switch back to user CR3 */ SWITCH_TO_USER_CR3 scratch_reg=%eax @@ -1290,7 +1290,7 @@ SYM_CODE_END(simd_coprocessor_error) SYM_CODE_START(device_not_available) ASM_CLAC - pushl $-1 # mark this as an int + pushl $0 pushl $do_device_not_available jmp common_exception SYM_CODE_END(device_not_available) @@ -1365,7 +1365,7 @@ SYM_CODE_END(divide_error) SYM_CODE_START(machine_check) ASM_CLAC pushl $0 - pushl machine_check_vector + pushl $do_mce jmp common_exception SYM_CODE_END(machine_check) #endif @@ -1531,7 +1531,7 @@ SYM_CODE_START(debug) * Entry from sysenter is now handled in common_exception */ ASM_CLAC - pushl $-1 # mark this as an int + pushl $0 pushl $do_debug jmp common_exception SYM_CODE_END(debug) @@ -1682,18 +1682,13 @@ SYM_CODE_END(nmi) SYM_CODE_START(int3) ASM_CLAC - pushl $-1 # mark this as an int - - SAVE_ALL switch_stacks=1 - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - xorl %edx, %edx # zero error code - movl %esp, %eax # pt_regs pointer - call do_int3 - jmp ret_from_exception + pushl $0 + pushl $do_int3 + jmp common_exception SYM_CODE_END(int3) SYM_CODE_START(general_protection) + ASM_CLAC pushl $do_general_protection jmp common_exception SYM_CODE_END(general_protection) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f2bb91e87877..0e9504fabe52 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -174,7 +174,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ - TRACE_IRQS_IRETQ /* we're about to change IF */ + TRACE_IRQS_ON /* return enables interrupts */ /* * Try to use SYSRET instead of IRET if we're returning to @@ -619,7 +619,7 @@ ret_from_intr: .Lretint_user: mov %rsp,%rdi call prepare_exit_to_usermode - TRACE_IRQS_IRETQ + TRACE_IRQS_ON SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 7d17b3addbbb..86eb0d89d46f 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -4,29 +4,22 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> -#include <asm/asm-offsets.h> +#include <linux/syscalls.h> +#include <asm/unistd.h> #include <asm/syscall.h> -#ifdef CONFIG_IA32_EMULATION -/* On X86_64, we use struct pt_regs * to pass parameters to syscalls */ -#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); -#define __sys_ni_syscall __ia32_sys_ni_syscall -#else /* CONFIG_IA32_EMULATION */ -#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -#define __sys_ni_syscall sys_ni_syscall -#endif /* CONFIG_IA32_EMULATION */ +#define __SYSCALL_I386(nr, sym) extern long __ia32_##sym(const struct pt_regs *); #include <asm/syscalls_32.h> #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, qual) [nr] = sym, +#define __SYSCALL_I386(nr, sym) [nr] = __ia32_##sym, -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_compat_max] = &__sys_ni_syscall, + [0 ... __NR_ia32_syscall_max] = &__ia32_sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index adf619a856e8..1594ec72bcbb 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -5,24 +5,17 @@ #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> -#include <asm/asm-offsets.h> +#include <asm/unistd.h> #include <asm/syscall.h> -extern asmlinkage long sys_ni_syscall(void); +#define __SYSCALL_X32(nr, sym) +#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) -SYSCALL_DEFINE0(ni_syscall) -{ - return sys_ni_syscall(); -} - -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); -#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_64(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include <asm/syscalls_64.h> #undef __SYSCALL_64 -#undef __SYSCALL_X32 -#define __SYSCALL_64(nr, sym, qual) [nr] = sym, -#define __SYSCALL_X32(nr, sym, qual) +#define __SYSCALL_64(nr, sym) [nr] = __x64_##sym, asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* @@ -32,25 +25,3 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall, #include <asm/syscalls_64.h> }; - -#undef __SYSCALL_64 -#undef __SYSCALL_X32 - -#ifdef CONFIG_X86_X32_ABI - -#define __SYSCALL_64(nr, sym, qual) -#define __SYSCALL_X32(nr, sym, qual) [nr] = sym, - -asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_x32_max] = &__x64_sys_ni_syscall, -#include <asm/syscalls_64.h> -}; - -#undef __SYSCALL_64 -#undef __SYSCALL_X32 - -#endif diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c new file mode 100644 index 000000000000..3d8d70d3896c --- /dev/null +++ b/arch/x86/entry/syscall_x32.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* System call table for x32 ABI. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <linux/syscalls.h> +#include <asm/unistd.h> +#include <asm/syscall.h> + +#define __SYSCALL_64(nr, sym) + +#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); +#define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#include <asm/syscalls_64.h> +#undef __SYSCALL_X32 +#undef __SYSCALL_COMMON + +#define __SYSCALL_X32(nr, sym) [nr] = __x32_##sym, +#define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym, + +asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall, +#include <asm/syscalls_64.h> +}; diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c17cb77eb150..54581ac671b4 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -11,434 +11,434 @@ # # The abi is always "i386" for this file. # -0 i386 restart_syscall sys_restart_syscall __ia32_sys_restart_syscall -1 i386 exit sys_exit __ia32_sys_exit -2 i386 fork sys_fork __ia32_sys_fork -3 i386 read sys_read __ia32_sys_read -4 i386 write sys_write __ia32_sys_write -5 i386 open sys_open __ia32_compat_sys_open -6 i386 close sys_close __ia32_sys_close -7 i386 waitpid sys_waitpid __ia32_sys_waitpid -8 i386 creat sys_creat __ia32_sys_creat -9 i386 link sys_link __ia32_sys_link -10 i386 unlink sys_unlink __ia32_sys_unlink -11 i386 execve sys_execve __ia32_compat_sys_execve -12 i386 chdir sys_chdir __ia32_sys_chdir -13 i386 time sys_time32 __ia32_sys_time32 -14 i386 mknod sys_mknod __ia32_sys_mknod -15 i386 chmod sys_chmod __ia32_sys_chmod -16 i386 lchown sys_lchown16 __ia32_sys_lchown16 +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit +2 i386 fork sys_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve sys_execve compat_sys_execve +12 i386 chdir sys_chdir +13 i386 time sys_time32 +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 17 i386 break -18 i386 oldstat sys_stat __ia32_sys_stat -19 i386 lseek sys_lseek __ia32_compat_sys_lseek -20 i386 getpid sys_getpid __ia32_sys_getpid -21 i386 mount sys_mount __ia32_compat_sys_mount -22 i386 umount sys_oldumount __ia32_sys_oldumount -23 i386 setuid sys_setuid16 __ia32_sys_setuid16 -24 i386 getuid sys_getuid16 __ia32_sys_getuid16 -25 i386 stime sys_stime32 __ia32_sys_stime32 -26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace -27 i386 alarm sys_alarm __ia32_sys_alarm -28 i386 oldfstat sys_fstat __ia32_sys_fstat -29 i386 pause sys_pause __ia32_sys_pause -30 i386 utime sys_utime32 __ia32_sys_utime32 +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek compat_sys_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount compat_sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime32 +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime32 31 i386 stty 32 i386 gtty -33 i386 access sys_access __ia32_sys_access -34 i386 nice sys_nice __ia32_sys_nice +33 i386 access sys_access +34 i386 nice sys_nice 35 i386 ftime -36 i386 sync sys_sync __ia32_sys_sync -37 i386 kill sys_kill __ia32_sys_kill -38 i386 rename sys_rename __ia32_sys_rename -39 i386 mkdir sys_mkdir __ia32_sys_mkdir -40 i386 rmdir sys_rmdir __ia32_sys_rmdir -41 i386 dup sys_dup __ia32_sys_dup -42 i386 pipe sys_pipe __ia32_sys_pipe -43 i386 times sys_times __ia32_compat_sys_times +36 i386 sync sys_sync +37 i386 kill sys_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times 44 i386 prof -45 i386 brk sys_brk __ia32_sys_brk -46 i386 setgid sys_setgid16 __ia32_sys_setgid16 -47 i386 getgid sys_getgid16 __ia32_sys_getgid16 -48 i386 signal sys_signal __ia32_sys_signal -49 i386 geteuid sys_geteuid16 __ia32_sys_geteuid16 -50 i386 getegid sys_getegid16 __ia32_sys_getegid16 -51 i386 acct sys_acct __ia32_sys_acct -52 i386 umount2 sys_umount __ia32_sys_umount +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount 53 i386 lock -54 i386 ioctl sys_ioctl __ia32_compat_sys_ioctl -55 i386 fcntl sys_fcntl __ia32_compat_sys_fcntl64 +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 56 i386 mpx -57 i386 setpgid sys_setpgid __ia32_sys_setpgid +57 i386 setpgid sys_setpgid 58 i386 ulimit -59 i386 oldolduname sys_olduname __ia32_sys_olduname -60 i386 umask sys_umask __ia32_sys_umask -61 i386 chroot sys_chroot __ia32_sys_chroot -62 i386 ustat sys_ustat __ia32_compat_sys_ustat -63 i386 dup2 sys_dup2 __ia32_sys_dup2 -64 i386 getppid sys_getppid __ia32_sys_getppid -65 i386 getpgrp sys_getpgrp __ia32_sys_getpgrp -66 i386 setsid sys_setsid __ia32_sys_setsid -67 i386 sigaction sys_sigaction __ia32_compat_sys_sigaction -68 i386 sgetmask sys_sgetmask __ia32_sys_sgetmask -69 i386 ssetmask sys_ssetmask __ia32_sys_ssetmask -70 i386 setreuid sys_setreuid16 __ia32_sys_setreuid16 -71 i386 setregid sys_setregid16 __ia32_sys_setregid16 -72 i386 sigsuspend sys_sigsuspend __ia32_sys_sigsuspend -73 i386 sigpending sys_sigpending __ia32_compat_sys_sigpending -74 i386 sethostname sys_sethostname __ia32_sys_sethostname -75 i386 setrlimit sys_setrlimit __ia32_compat_sys_setrlimit -76 i386 getrlimit sys_old_getrlimit __ia32_compat_sys_old_getrlimit -77 i386 getrusage sys_getrusage __ia32_compat_sys_getrusage -78 i386 gettimeofday sys_gettimeofday __ia32_compat_sys_gettimeofday -79 i386 settimeofday sys_settimeofday __ia32_compat_sys_settimeofday -80 i386 getgroups sys_getgroups16 __ia32_sys_getgroups16 -81 i386 setgroups sys_setgroups16 __ia32_sys_setgroups16 -82 i386 select sys_old_select __ia32_compat_sys_old_select -83 i386 symlink sys_symlink __ia32_sys_symlink -84 i386 oldlstat sys_lstat __ia32_sys_lstat -85 i386 readlink sys_readlink __ia32_sys_readlink -86 i386 uselib sys_uselib __ia32_sys_uselib -87 i386 swapon sys_swapon __ia32_sys_swapon -88 i386 reboot sys_reboot __ia32_sys_reboot -89 i386 readdir sys_old_readdir __ia32_compat_sys_old_readdir -90 i386 mmap sys_old_mmap __ia32_compat_sys_x86_mmap -91 i386 munmap sys_munmap __ia32_sys_munmap -92 i386 truncate sys_truncate __ia32_compat_sys_truncate -93 i386 ftruncate sys_ftruncate __ia32_compat_sys_ftruncate -94 i386 fchmod sys_fchmod __ia32_sys_fchmod -95 i386 fchown sys_fchown16 __ia32_sys_fchown16 -96 i386 getpriority sys_getpriority __ia32_sys_getpriority -97 i386 setpriority sys_setpriority __ia32_sys_setpriority +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction sys_sigaction compat_sys_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap compat_sys_ia32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate compat_sys_truncate +93 i386 ftruncate sys_ftruncate compat_sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority 98 i386 profil -99 i386 statfs sys_statfs __ia32_compat_sys_statfs -100 i386 fstatfs sys_fstatfs __ia32_compat_sys_fstatfs -101 i386 ioperm sys_ioperm __ia32_sys_ioperm -102 i386 socketcall sys_socketcall __ia32_compat_sys_socketcall -103 i386 syslog sys_syslog __ia32_sys_syslog -104 i386 setitimer sys_setitimer __ia32_compat_sys_setitimer -105 i386 getitimer sys_getitimer __ia32_compat_sys_getitimer -106 i386 stat sys_newstat __ia32_compat_sys_newstat -107 i386 lstat sys_newlstat __ia32_compat_sys_newlstat -108 i386 fstat sys_newfstat __ia32_compat_sys_newfstat -109 i386 olduname sys_uname __ia32_sys_uname -110 i386 iopl sys_iopl __ia32_sys_iopl -111 i386 vhangup sys_vhangup __ia32_sys_vhangup +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat compat_sys_newfstat +109 i386 olduname sys_uname +110 i386 iopl sys_iopl +111 i386 vhangup sys_vhangup 112 i386 idle -113 i386 vm86old sys_vm86old __ia32_sys_ni_syscall -114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4 -115 i386 swapoff sys_swapoff __ia32_sys_swapoff -116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo -117 i386 ipc sys_ipc __ia32_compat_sys_ipc -118 i386 fsync sys_fsync __ia32_sys_fsync -119 i386 sigreturn sys_sigreturn __ia32_compat_sys_sigreturn -120 i386 clone sys_clone __ia32_compat_sys_x86_clone -121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname -122 i386 uname sys_newuname __ia32_sys_newuname -123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt -124 i386 adjtimex sys_adjtimex_time32 __ia32_sys_adjtimex_time32 -125 i386 mprotect sys_mprotect __ia32_sys_mprotect -126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask +113 i386 vm86old sys_vm86old sys_ni_syscall +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc compat_sys_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn sys_sigreturn compat_sys_sigreturn +120 i386 clone sys_clone compat_sys_ia32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex_time32 +125 i386 mprotect sys_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask 127 i386 create_module -128 i386 init_module sys_init_module __ia32_sys_init_module -129 i386 delete_module sys_delete_module __ia32_sys_delete_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module 130 i386 get_kernel_syms -131 i386 quotactl sys_quotactl __ia32_compat_sys_quotactl32 -132 i386 getpgid sys_getpgid __ia32_sys_getpgid -133 i386 fchdir sys_fchdir __ia32_sys_fchdir -134 i386 bdflush sys_bdflush __ia32_sys_bdflush -135 i386 sysfs sys_sysfs __ia32_sys_sysfs -136 i386 personality sys_personality __ia32_sys_personality +131 i386 quotactl sys_quotactl compat_sys_quotactl32 +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_bdflush +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality 137 i386 afs_syscall -138 i386 setfsuid sys_setfsuid16 __ia32_sys_setfsuid16 -139 i386 setfsgid sys_setfsgid16 __ia32_sys_setfsgid16 -140 i386 _llseek sys_llseek __ia32_sys_llseek -141 i386 getdents sys_getdents __ia32_compat_sys_getdents -142 i386 _newselect sys_select __ia32_compat_sys_select -143 i386 flock sys_flock __ia32_sys_flock -144 i386 msync sys_msync __ia32_sys_msync -145 i386 readv sys_readv __ia32_compat_sys_readv -146 i386 writev sys_writev __ia32_compat_sys_writev -147 i386 getsid sys_getsid __ia32_sys_getsid -148 i386 fdatasync sys_fdatasync __ia32_sys_fdatasync -149 i386 _sysctl sys_sysctl __ia32_compat_sys_sysctl -150 i386 mlock sys_mlock __ia32_sys_mlock -151 i386 munlock sys_munlock __ia32_sys_munlock -152 i386 mlockall sys_mlockall __ia32_sys_mlockall -153 i386 munlockall sys_munlockall __ia32_sys_munlockall -154 i386 sched_setparam sys_sched_setparam __ia32_sys_sched_setparam -155 i386 sched_getparam sys_sched_getparam __ia32_sys_sched_getparam -156 i386 sched_setscheduler sys_sched_setscheduler __ia32_sys_sched_setscheduler -157 i386 sched_getscheduler sys_sched_getscheduler __ia32_sys_sched_getscheduler -158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield -159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max -160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min -161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32 __ia32_sys_sched_rr_get_interval_time32 -162 i386 nanosleep sys_nanosleep_time32 __ia32_sys_nanosleep_time32 -163 i386 mremap sys_mremap __ia32_sys_mremap -164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 -165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 -166 i386 vm86 sys_vm86 __ia32_sys_ni_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv compat_sys_readv +146 i386 writev sys_writev compat_sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_sysctl compat_sys_sysctl +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 i386 nanosleep sys_nanosleep_time32 +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 sys_vm86 sys_ni_syscall 167 i386 query_module -168 i386 poll sys_poll __ia32_sys_poll +168 i386 poll sys_poll 169 i386 nfsservctl -170 i386 setresgid sys_setresgid16 __ia32_sys_setresgid16 -171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16 -172 i386 prctl sys_prctl __ia32_sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn __ia32_compat_sys_rt_sigreturn -174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask -176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending -177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32 -178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo -179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_compat_sys_rt_sigsuspend -180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread -181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite -182 i386 chown sys_chown16 __ia32_sys_chown16 -183 i386 getcwd sys_getcwd __ia32_sys_getcwd -184 i386 capget sys_capget __ia32_sys_capget -185 i386 capset sys_capset __ia32_sys_capset -186 i386 sigaltstack sys_sigaltstack __ia32_compat_sys_sigaltstack -187 i386 sendfile sys_sendfile __ia32_compat_sys_sendfile +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32 +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend +180 i386 pread64 sys_ia32_pread64 +181 i386 pwrite64 sys_ia32_pwrite64 +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 i386 sendfile sys_sendfile compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork __ia32_sys_vfork -191 i386 ugetrlimit sys_getrlimit __ia32_compat_sys_getrlimit -192 i386 mmap2 sys_mmap_pgoff __ia32_sys_mmap_pgoff -193 i386 truncate64 sys_truncate64 __ia32_compat_sys_x86_truncate64 -194 i386 ftruncate64 sys_ftruncate64 __ia32_compat_sys_x86_ftruncate64 -195 i386 stat64 sys_stat64 __ia32_compat_sys_x86_stat64 -196 i386 lstat64 sys_lstat64 __ia32_compat_sys_x86_lstat64 -197 i386 fstat64 sys_fstat64 __ia32_compat_sys_x86_fstat64 -198 i386 lchown32 sys_lchown __ia32_sys_lchown -199 i386 getuid32 sys_getuid __ia32_sys_getuid -200 i386 getgid32 sys_getgid __ia32_sys_getgid -201 i386 geteuid32 sys_geteuid __ia32_sys_geteuid -202 i386 getegid32 sys_getegid __ia32_sys_getegid -203 i386 setreuid32 sys_setreuid __ia32_sys_setreuid -204 i386 setregid32 sys_setregid __ia32_sys_setregid -205 i386 getgroups32 sys_getgroups __ia32_sys_getgroups -206 i386 setgroups32 sys_setgroups __ia32_sys_setgroups -207 i386 fchown32 sys_fchown __ia32_sys_fchown -208 i386 setresuid32 sys_setresuid __ia32_sys_setresuid -209 i386 getresuid32 sys_getresuid __ia32_sys_getresuid -210 i386 setresgid32 sys_setresgid __ia32_sys_setresgid -211 i386 getresgid32 sys_getresgid __ia32_sys_getresgid -212 i386 chown32 sys_chown __ia32_sys_chown -213 i386 setuid32 sys_setuid __ia32_sys_setuid -214 i386 setgid32 sys_setgid __ia32_sys_setgid -215 i386 setfsuid32 sys_setfsuid __ia32_sys_setfsuid -216 i386 setfsgid32 sys_setfsgid __ia32_sys_setfsgid -217 i386 pivot_root sys_pivot_root __ia32_sys_pivot_root -218 i386 mincore sys_mincore __ia32_sys_mincore -219 i386 madvise sys_madvise __ia32_sys_madvise -220 i386 getdents64 sys_getdents64 __ia32_sys_getdents64 -221 i386 fcntl64 sys_fcntl64 __ia32_compat_sys_fcntl64 +190 i386 vfork sys_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_ia32_truncate64 +194 i386 ftruncate64 sys_ia32_ftruncate64 +195 i386 stat64 sys_stat64 compat_sys_ia32_stat64 +196 i386 lstat64 sys_lstat64 compat_sys_ia32_lstat64 +197 i386 fstat64 sys_fstat64 compat_sys_ia32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid +200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 # 222 is unused # 223 is unused -224 i386 gettid sys_gettid __ia32_sys_gettid -225 i386 readahead sys_readahead __ia32_compat_sys_x86_readahead -226 i386 setxattr sys_setxattr __ia32_sys_setxattr -227 i386 lsetxattr sys_lsetxattr __ia32_sys_lsetxattr -228 i386 fsetxattr sys_fsetxattr __ia32_sys_fsetxattr -229 i386 getxattr sys_getxattr __ia32_sys_getxattr -230 i386 lgetxattr sys_lgetxattr __ia32_sys_lgetxattr -231 i386 fgetxattr sys_fgetxattr __ia32_sys_fgetxattr -232 i386 listxattr sys_listxattr __ia32_sys_listxattr -233 i386 llistxattr sys_llistxattr __ia32_sys_llistxattr -234 i386 flistxattr sys_flistxattr __ia32_sys_flistxattr -235 i386 removexattr sys_removexattr __ia32_sys_removexattr -236 i386 lremovexattr sys_lremovexattr __ia32_sys_lremovexattr -237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr -238 i386 tkill sys_tkill __ia32_sys_tkill -239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64 -240 i386 futex sys_futex_time32 __ia32_sys_futex_time32 -241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity -242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity -243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area -244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area -245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup -246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy -247 i386 io_getevents sys_io_getevents_time32 __ia32_sys_io_getevents_time32 -248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit -249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel -250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64 +224 i386 gettid sys_gettid +225 i386 readahead sys_ia32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex_time32 +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents_time32 +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_ia32_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) -252 i386 exit_group sys_exit_group __ia32_sys_exit_group -253 i386 lookup_dcookie sys_lookup_dcookie __ia32_compat_sys_lookup_dcookie -254 i386 epoll_create sys_epoll_create __ia32_sys_epoll_create -255 i386 epoll_ctl sys_epoll_ctl __ia32_sys_epoll_ctl -256 i386 epoll_wait sys_epoll_wait __ia32_sys_epoll_wait -257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages -258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address -259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create -260 i386 timer_settime sys_timer_settime32 __ia32_sys_timer_settime32 -261 i386 timer_gettime sys_timer_gettime32 __ia32_sys_timer_gettime32 -262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun -263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete -264 i386 clock_settime sys_clock_settime32 __ia32_sys_clock_settime32 -265 i386 clock_gettime sys_clock_gettime32 __ia32_sys_clock_gettime32 -266 i386 clock_getres sys_clock_getres_time32 __ia32_sys_clock_getres_time32 -267 i386 clock_nanosleep sys_clock_nanosleep_time32 __ia32_sys_clock_nanosleep_time32 -268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64 -269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64 -270 i386 tgkill sys_tgkill __ia32_sys_tgkill -271 i386 utimes sys_utimes_time32 __ia32_sys_utimes_time32 -272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64 +252 i386 exit_group sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime32 +261 i386 timer_gettime sys_timer_gettime32 +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime32 +265 i386 clock_gettime sys_clock_gettime32 +266 i386 clock_getres sys_clock_getres_time32 +267 i386 clock_nanosleep sys_clock_nanosleep_time32 +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes_time32 +272 i386 fadvise64_64 sys_ia32_fadvise64_64 273 i386 vserver -274 i386 mbind sys_mbind __ia32_sys_mbind -275 i386 get_mempolicy sys_get_mempolicy __ia32_compat_sys_get_mempolicy -276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy -277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open -278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink -279 i386 mq_timedsend sys_mq_timedsend_time32 __ia32_sys_mq_timedsend_time32 -280 i386 mq_timedreceive sys_mq_timedreceive_time32 __ia32_sys_mq_timedreceive_time32 -281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify -282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr -283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load -284 i386 waitid sys_waitid __ia32_compat_sys_waitid +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend_time32 +280 i386 mq_timedreceive sys_mq_timedreceive_time32 +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid # 285 sys_setaltroot -286 i386 add_key sys_add_key __ia32_sys_add_key -287 i386 request_key sys_request_key __ia32_sys_request_key -288 i386 keyctl sys_keyctl __ia32_compat_sys_keyctl -289 i386 ioprio_set sys_ioprio_set __ia32_sys_ioprio_set -290 i386 ioprio_get sys_ioprio_get __ia32_sys_ioprio_get -291 i386 inotify_init sys_inotify_init __ia32_sys_inotify_init -292 i386 inotify_add_watch sys_inotify_add_watch __ia32_sys_inotify_add_watch -293 i386 inotify_rm_watch sys_inotify_rm_watch __ia32_sys_inotify_rm_watch -294 i386 migrate_pages sys_migrate_pages __ia32_sys_migrate_pages -295 i386 openat sys_openat __ia32_compat_sys_openat -296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat -297 i386 mknodat sys_mknodat __ia32_sys_mknodat -298 i386 fchownat sys_fchownat __ia32_sys_fchownat -299 i386 futimesat sys_futimesat_time32 __ia32_sys_futimesat_time32 -300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat -301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat -302 i386 renameat sys_renameat __ia32_sys_renameat -303 i386 linkat sys_linkat __ia32_sys_linkat -304 i386 symlinkat sys_symlinkat __ia32_sys_symlinkat -305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat -306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat -307 i386 faccessat sys_faccessat __ia32_sys_faccessat -308 i386 pselect6 sys_pselect6_time32 __ia32_compat_sys_pselect6_time32 -309 i386 ppoll sys_ppoll_time32 __ia32_compat_sys_ppoll_time32 -310 i386 unshare sys_unshare __ia32_sys_unshare -311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list -312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list -313 i386 splice sys_splice __ia32_sys_splice -314 i386 sync_file_range sys_sync_file_range __ia32_compat_sys_x86_sync_file_range -315 i386 tee sys_tee __ia32_sys_tee -316 i386 vmsplice sys_vmsplice __ia32_compat_sys_vmsplice -317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages -318 i386 getcpu sys_getcpu __ia32_sys_getcpu -319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait -320 i386 utimensat sys_utimensat_time32 __ia32_sys_utimensat_time32 -321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd -322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create -323 i386 eventfd sys_eventfd __ia32_sys_eventfd -324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate -325 i386 timerfd_settime sys_timerfd_settime32 __ia32_sys_timerfd_settime32 -326 i386 timerfd_gettime sys_timerfd_gettime32 __ia32_sys_timerfd_gettime32 -327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4 -328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2 -329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1 -330 i386 dup3 sys_dup3 __ia32_sys_dup3 -331 i386 pipe2 sys_pipe2 __ia32_sys_pipe2 -332 i386 inotify_init1 sys_inotify_init1 __ia32_sys_inotify_init1 -333 i386 preadv sys_preadv __ia32_compat_sys_preadv -334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev -335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo -336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open -337 i386 recvmmsg sys_recvmmsg_time32 __ia32_compat_sys_recvmmsg_time32 -338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init -339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark -340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64 -341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at -342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at -343 i386 clock_adjtime sys_clock_adjtime32 __ia32_sys_clock_adjtime32 -344 i386 syncfs sys_syncfs __ia32_sys_syncfs -345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg -346 i386 setns sys_setns __ia32_sys_setns -347 i386 process_vm_readv sys_process_vm_readv __ia32_compat_sys_process_vm_readv -348 i386 process_vm_writev sys_process_vm_writev __ia32_compat_sys_process_vm_writev -349 i386 kcmp sys_kcmp __ia32_sys_kcmp -350 i386 finit_module sys_finit_module __ia32_sys_finit_module -351 i386 sched_setattr sys_sched_setattr __ia32_sys_sched_setattr -352 i386 sched_getattr sys_sched_getattr __ia32_sys_sched_getattr -353 i386 renameat2 sys_renameat2 __ia32_sys_renameat2 -354 i386 seccomp sys_seccomp __ia32_sys_seccomp -355 i386 getrandom sys_getrandom __ia32_sys_getrandom -356 i386 memfd_create sys_memfd_create __ia32_sys_memfd_create -357 i386 bpf sys_bpf __ia32_sys_bpf -358 i386 execveat sys_execveat __ia32_compat_sys_execveat -359 i386 socket sys_socket __ia32_sys_socket -360 i386 socketpair sys_socketpair __ia32_sys_socketpair -361 i386 bind sys_bind __ia32_sys_bind -362 i386 connect sys_connect __ia32_sys_connect -363 i386 listen sys_listen __ia32_sys_listen -364 i386 accept4 sys_accept4 __ia32_sys_accept4 -365 i386 getsockopt sys_getsockopt __ia32_compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt __ia32_compat_sys_setsockopt -367 i386 getsockname sys_getsockname __ia32_sys_getsockname -368 i386 getpeername sys_getpeername __ia32_sys_getpeername -369 i386 sendto sys_sendto __ia32_sys_sendto -370 i386 sendmsg sys_sendmsg __ia32_compat_sys_sendmsg -371 i386 recvfrom sys_recvfrom __ia32_compat_sys_recvfrom -372 i386 recvmsg sys_recvmsg __ia32_compat_sys_recvmsg -373 i386 shutdown sys_shutdown __ia32_sys_shutdown -374 i386 userfaultfd sys_userfaultfd __ia32_sys_userfaultfd -375 i386 membarrier sys_membarrier __ia32_sys_membarrier -376 i386 mlock2 sys_mlock2 __ia32_sys_mlock2 -377 i386 copy_file_range sys_copy_file_range __ia32_sys_copy_file_range -378 i386 preadv2 sys_preadv2 __ia32_compat_sys_preadv2 -379 i386 pwritev2 sys_pwritev2 __ia32_compat_sys_pwritev2 -380 i386 pkey_mprotect sys_pkey_mprotect __ia32_sys_pkey_mprotect -381 i386 pkey_alloc sys_pkey_alloc __ia32_sys_pkey_alloc -382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free -383 i386 statx sys_statx __ia32_sys_statx -384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl -385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents -386 i386 rseq sys_rseq __ia32_sys_rseq -393 i386 semget sys_semget __ia32_sys_semget -394 i386 semctl sys_semctl __ia32_compat_sys_semctl -395 i386 shmget sys_shmget __ia32_sys_shmget -396 i386 shmctl sys_shmctl __ia32_compat_sys_shmctl -397 i386 shmat sys_shmat __ia32_compat_sys_shmat -398 i386 shmdt sys_shmdt __ia32_sys_shmdt -399 i386 msgget sys_msgget __ia32_sys_msgget -400 i386 msgsnd sys_msgsnd __ia32_compat_sys_msgsnd -401 i386 msgrcv sys_msgrcv __ia32_compat_sys_msgrcv -402 i386 msgctl sys_msgctl __ia32_compat_sys_msgctl -403 i386 clock_gettime64 sys_clock_gettime __ia32_sys_clock_gettime -404 i386 clock_settime64 sys_clock_settime __ia32_sys_clock_settime -405 i386 clock_adjtime64 sys_clock_adjtime __ia32_sys_clock_adjtime -406 i386 clock_getres_time64 sys_clock_getres __ia32_sys_clock_getres -407 i386 clock_nanosleep_time64 sys_clock_nanosleep __ia32_sys_clock_nanosleep -408 i386 timer_gettime64 sys_timer_gettime __ia32_sys_timer_gettime -409 i386 timer_settime64 sys_timer_settime __ia32_sys_timer_settime -410 i386 timerfd_gettime64 sys_timerfd_gettime __ia32_sys_timerfd_gettime -411 i386 timerfd_settime64 sys_timerfd_settime __ia32_sys_timerfd_settime -412 i386 utimensat_time64 sys_utimensat __ia32_sys_utimensat -413 i386 pselect6_time64 sys_pselect6 __ia32_compat_sys_pselect6_time64 -414 i386 ppoll_time64 sys_ppoll __ia32_compat_sys_ppoll_time64 -416 i386 io_pgetevents_time64 sys_io_pgetevents __ia32_sys_io_pgetevents -417 i386 recvmmsg_time64 sys_recvmmsg __ia32_compat_sys_recvmmsg_time64 -418 i386 mq_timedsend_time64 sys_mq_timedsend __ia32_sys_mq_timedsend -419 i386 mq_timedreceive_time64 sys_mq_timedreceive __ia32_sys_mq_timedreceive -420 i386 semtimedop_time64 sys_semtimedop __ia32_sys_semtimedop -421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64 -422 i386 futex_time64 sys_futex __ia32_sys_futex -423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval -424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal -425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup -426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter -427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register -428 i386 open_tree sys_open_tree __ia32_sys_open_tree -429 i386 move_mount sys_move_mount __ia32_sys_move_mount -430 i386 fsopen sys_fsopen __ia32_sys_fsopen -431 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig -432 i386 fsmount sys_fsmount __ia32_sys_fsmount -433 i386 fspick sys_fspick __ia32_sys_fspick -434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open -435 i386 clone3 sys_clone3 __ia32_sys_clone3 -437 i386 openat2 sys_openat2 __ia32_sys_openat2 -438 i386 pidfd_getfd sys_pidfd_getfd __ia32_sys_pidfd_getfd +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl compat_sys_keyctl +289 i386 ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat_time32 +300 i386 fstatat64 sys_fstatat64 compat_sys_ia32_fstatat64 +301 i386 unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32 +309 i386 ppoll sys_ppoll_time32 compat_sys_ppoll_time32 +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_ia32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice compat_sys_vmsplice +317 i386 move_pages sys_move_pages compat_sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat_time32 +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_ia32_fallocate +325 i386 timerfd_settime sys_timerfd_settime32 +326 i386 timerfd_gettime sys_timerfd_gettime32 +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime32 +344 i386 syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp +350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp +355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf +358 i386 execveat sys_execveat compat_sys_execveat +359 i386 socket sys_socket +360 i386 socketpair sys_socketpair +361 i386 bind sys_bind +362 i386 connect sys_connect +363 i386 listen sys_listen +364 i386 accept4 sys_accept4 +365 i386 getsockopt sys_getsockopt compat_sys_getsockopt +366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +367 i386 getsockname sys_getsockname +368 i386 getpeername sys_getpeername +369 i386 sendto sys_sendto +370 i386 sendmsg sys_sendmsg compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg compat_sys_recvmsg +373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd +375 i386 membarrier sys_membarrier +376 i386 mlock2 sys_mlock2 +377 i386 copy_file_range sys_copy_file_range +378 i386 preadv2 sys_preadv2 compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 +380 i386 pkey_mprotect sys_pkey_mprotect +381 i386 pkey_alloc sys_pkey_alloc +382 i386 pkey_free sys_pkey_free +383 i386 statx sys_statx +384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents +386 i386 rseq sys_rseq +393 i386 semget sys_semget +394 i386 semctl sys_semctl compat_sys_semctl +395 i386 shmget sys_shmget +396 i386 shmctl sys_shmctl compat_sys_shmctl +397 i386 shmat sys_shmat compat_sys_shmat +398 i386 shmdt sys_shmdt +399 i386 msgget sys_msgget +400 i386 msgsnd sys_msgsnd compat_sys_msgsnd +401 i386 msgrcv sys_msgrcv compat_sys_msgrcv +402 i386 msgctl sys_msgctl compat_sys_msgctl +403 i386 clock_gettime64 sys_clock_gettime +404 i386 clock_settime64 sys_clock_settime +405 i386 clock_adjtime64 sys_clock_adjtime +406 i386 clock_getres_time64 sys_clock_getres +407 i386 clock_nanosleep_time64 sys_clock_nanosleep +408 i386 timer_gettime64 sys_timer_gettime +409 i386 timer_settime64 sys_timer_settime +410 i386 timerfd_gettime64 sys_timerfd_gettime +411 i386 timerfd_settime64 sys_timerfd_settime +412 i386 utimensat_time64 sys_utimensat +413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 +414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 +416 i386 io_pgetevents_time64 sys_io_pgetevents +417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 +418 i386 mq_timedsend_time64 sys_mq_timedsend +419 i386 mq_timedreceive_time64 sys_mq_timedreceive +420 i386 semtimedop_time64 sys_semtimedop +421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +422 i386 futex_time64 sys_futex +423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 i386 pidfd_send_signal sys_pidfd_send_signal +425 i386 io_uring_setup sys_io_uring_setup +426 i386 io_uring_enter sys_io_uring_enter +427 i386 io_uring_register sys_io_uring_register +428 i386 open_tree sys_open_tree +429 i386 move_mount sys_move_mount +430 i386 fsopen sys_fsopen +431 i386 fsconfig sys_fsconfig +432 i386 fsmount sys_fsmount +433 i386 fspick sys_fspick +434 i386 pidfd_open sys_pidfd_open +435 i386 clone3 sys_clone3 +437 i386 openat2 sys_openat2 +438 i386 pidfd_getfd sys_pidfd_getfd diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 44d510bc9b78..37b844f839bc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -8,357 +8,357 @@ # # The abi is "common", "64" or "x32" for this file. # -0 common read __x64_sys_read -1 common write __x64_sys_write -2 common open __x64_sys_open -3 common close __x64_sys_close -4 common stat __x64_sys_newstat -5 common fstat __x64_sys_newfstat -6 common lstat __x64_sys_newlstat -7 common poll __x64_sys_poll -8 common lseek __x64_sys_lseek -9 common mmap __x64_sys_mmap -10 common mprotect __x64_sys_mprotect -11 common munmap __x64_sys_munmap -12 common brk __x64_sys_brk -13 64 rt_sigaction __x64_sys_rt_sigaction -14 common rt_sigprocmask __x64_sys_rt_sigprocmask -15 64 rt_sigreturn __x64_sys_rt_sigreturn/ptregs -16 64 ioctl __x64_sys_ioctl -17 common pread64 __x64_sys_pread64 -18 common pwrite64 __x64_sys_pwrite64 -19 64 readv __x64_sys_readv -20 64 writev __x64_sys_writev -21 common access __x64_sys_access -22 common pipe __x64_sys_pipe -23 common select __x64_sys_select -24 common sched_yield __x64_sys_sched_yield -25 common mremap __x64_sys_mremap -26 common msync __x64_sys_msync -27 common mincore __x64_sys_mincore -28 common madvise __x64_sys_madvise -29 common shmget __x64_sys_shmget -30 common shmat __x64_sys_shmat -31 common shmctl __x64_sys_shmctl -32 common dup __x64_sys_dup -33 common dup2 __x64_sys_dup2 -34 common pause __x64_sys_pause -35 common nanosleep __x64_sys_nanosleep -36 common getitimer __x64_sys_getitimer -37 common alarm __x64_sys_alarm -38 common setitimer __x64_sys_setitimer -39 common getpid __x64_sys_getpid -40 common sendfile __x64_sys_sendfile64 -41 common socket __x64_sys_socket -42 common connect __x64_sys_connect -43 common accept __x64_sys_accept -44 common sendto __x64_sys_sendto -45 64 recvfrom __x64_sys_recvfrom -46 64 sendmsg __x64_sys_sendmsg -47 64 recvmsg __x64_sys_recvmsg -48 common shutdown __x64_sys_shutdown -49 common bind __x64_sys_bind -50 common listen __x64_sys_listen -51 common getsockname __x64_sys_getsockname -52 common getpeername __x64_sys_getpeername -53 common socketpair __x64_sys_socketpair -54 64 setsockopt __x64_sys_setsockopt -55 64 getsockopt __x64_sys_getsockopt -56 common clone __x64_sys_clone/ptregs -57 common fork __x64_sys_fork/ptregs -58 common vfork __x64_sys_vfork/ptregs -59 64 execve __x64_sys_execve/ptregs -60 common exit __x64_sys_exit -61 common wait4 __x64_sys_wait4 -62 common kill __x64_sys_kill -63 common uname __x64_sys_newuname -64 common semget __x64_sys_semget -65 common semop __x64_sys_semop -66 common semctl __x64_sys_semctl -67 common shmdt __x64_sys_shmdt -68 common msgget __x64_sys_msgget -69 common msgsnd __x64_sys_msgsnd -70 common msgrcv __x64_sys_msgrcv -71 common msgctl __x64_sys_msgctl -72 common fcntl __x64_sys_fcntl -73 common flock __x64_sys_flock -74 common fsync __x64_sys_fsync -75 common fdatasync __x64_sys_fdatasync -76 common truncate __x64_sys_truncate -77 common ftruncate __x64_sys_ftruncate -78 common getdents __x64_sys_getdents -79 common getcwd __x64_sys_getcwd -80 common chdir __x64_sys_chdir -81 common fchdir __x64_sys_fchdir -82 common rename __x64_sys_rename -83 common mkdir __x64_sys_mkdir -84 common rmdir __x64_sys_rmdir -85 common creat __x64_sys_creat -86 common link __x64_sys_link -87 common unlink __x64_sys_unlink -88 common symlink __x64_sys_symlink -89 common readlink __x64_sys_readlink -90 common chmod __x64_sys_chmod -91 common fchmod __x64_sys_fchmod -92 common chown __x64_sys_chown -93 common fchown __x64_sys_fchown -94 common lchown __x64_sys_lchown -95 common umask __x64_sys_umask -96 common gettimeofday __x64_sys_gettimeofday -97 common getrlimit __x64_sys_getrlimit -98 common getrusage __x64_sys_getrusage -99 common sysinfo __x64_sys_sysinfo -100 common times __x64_sys_times -101 64 ptrace __x64_sys_ptrace -102 common getuid __x64_sys_getuid -103 common syslog __x64_sys_syslog -104 common getgid __x64_sys_getgid -105 common setuid __x64_sys_setuid -106 common setgid __x64_sys_setgid -107 common geteuid __x64_sys_geteuid -108 common getegid __x64_sys_getegid -109 common setpgid __x64_sys_setpgid -110 common getppid __x64_sys_getppid -111 common getpgrp __x64_sys_getpgrp -112 common setsid __x64_sys_setsid -113 common setreuid __x64_sys_setreuid -114 common setregid __x64_sys_setregid -115 common getgroups __x64_sys_getgroups -116 common setgroups __x64_sys_setgroups -117 common setresuid __x64_sys_setresuid -118 common getresuid __x64_sys_getresuid -119 common setresgid __x64_sys_setresgid -120 common getresgid __x64_sys_getresgid -121 common getpgid __x64_sys_getpgid -122 common setfsuid __x64_sys_setfsuid -123 common setfsgid __x64_sys_setfsgid -124 common getsid __x64_sys_getsid -125 common capget __x64_sys_capget -126 common capset __x64_sys_capset -127 64 rt_sigpending __x64_sys_rt_sigpending -128 64 rt_sigtimedwait __x64_sys_rt_sigtimedwait -129 64 rt_sigqueueinfo __x64_sys_rt_sigqueueinfo -130 common rt_sigsuspend __x64_sys_rt_sigsuspend -131 64 sigaltstack __x64_sys_sigaltstack -132 common utime __x64_sys_utime -133 common mknod __x64_sys_mknod +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 common rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn sys_rt_sigreturn +16 64 ioctl sys_ioctl +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 common clone sys_clone +57 common fork sys_fork +58 common vfork sys_vfork +59 64 execve sys_execve +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo +100 common times sys_times +101 64 ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 common getresuid sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 common rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack sys_sigaltstack +132 common utime sys_utime +133 common mknod sys_mknod 134 64 uselib -135 common personality __x64_sys_personality -136 common ustat __x64_sys_ustat -137 common statfs __x64_sys_statfs -138 common fstatfs __x64_sys_fstatfs -139 common sysfs __x64_sys_sysfs -140 common getpriority __x64_sys_getpriority -141 common setpriority __x64_sys_setpriority -142 common sched_setparam __x64_sys_sched_setparam -143 common sched_getparam __x64_sys_sched_getparam -144 common sched_setscheduler __x64_sys_sched_setscheduler -145 common sched_getscheduler __x64_sys_sched_getscheduler -146 common sched_get_priority_max __x64_sys_sched_get_priority_max -147 common sched_get_priority_min __x64_sys_sched_get_priority_min -148 common sched_rr_get_interval __x64_sys_sched_rr_get_interval -149 common mlock __x64_sys_mlock -150 common munlock __x64_sys_munlock -151 common mlockall __x64_sys_mlockall -152 common munlockall __x64_sys_munlockall -153 common vhangup __x64_sys_vhangup -154 common modify_ldt __x64_sys_modify_ldt -155 common pivot_root __x64_sys_pivot_root -156 64 _sysctl __x64_sys_sysctl -157 common prctl __x64_sys_prctl -158 common arch_prctl __x64_sys_arch_prctl -159 common adjtimex __x64_sys_adjtimex -160 common setrlimit __x64_sys_setrlimit -161 common chroot __x64_sys_chroot -162 common sync __x64_sys_sync -163 common acct __x64_sys_acct -164 common settimeofday __x64_sys_settimeofday -165 common mount __x64_sys_mount -166 common umount2 __x64_sys_umount -167 common swapon __x64_sys_swapon -168 common swapoff __x64_sys_swapoff -169 common reboot __x64_sys_reboot -170 common sethostname __x64_sys_sethostname -171 common setdomainname __x64_sys_setdomainname -172 common iopl __x64_sys_iopl/ptregs -173 common ioperm __x64_sys_ioperm +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl sys_iopl +173 common ioperm sys_ioperm 174 64 create_module -175 common init_module __x64_sys_init_module -176 common delete_module __x64_sys_delete_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module 177 64 get_kernel_syms 178 64 query_module -179 common quotactl __x64_sys_quotactl +179 common quotactl sys_quotactl 180 64 nfsservctl 181 common getpmsg 182 common putpmsg 183 common afs_syscall 184 common tuxcall 185 common security -186 common gettid __x64_sys_gettid -187 common readahead __x64_sys_readahead -188 common setxattr __x64_sys_setxattr -189 common lsetxattr __x64_sys_lsetxattr -190 common fsetxattr __x64_sys_fsetxattr -191 common getxattr __x64_sys_getxattr -192 common lgetxattr __x64_sys_lgetxattr -193 common fgetxattr __x64_sys_fgetxattr -194 common listxattr __x64_sys_listxattr -195 common llistxattr __x64_sys_llistxattr -196 common flistxattr __x64_sys_flistxattr -197 common removexattr __x64_sys_removexattr -198 common lremovexattr __x64_sys_lremovexattr -199 common fremovexattr __x64_sys_fremovexattr -200 common tkill __x64_sys_tkill -201 common time __x64_sys_time -202 common futex __x64_sys_futex -203 common sched_setaffinity __x64_sys_sched_setaffinity -204 common sched_getaffinity __x64_sys_sched_getaffinity +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 common sched_getaffinity sys_sched_getaffinity 205 64 set_thread_area -206 64 io_setup __x64_sys_io_setup -207 common io_destroy __x64_sys_io_destroy -208 common io_getevents __x64_sys_io_getevents -209 64 io_submit __x64_sys_io_submit -210 common io_cancel __x64_sys_io_cancel +206 64 io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 common io_cancel sys_io_cancel 211 64 get_thread_area -212 common lookup_dcookie __x64_sys_lookup_dcookie -213 common epoll_create __x64_sys_epoll_create +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create 214 64 epoll_ctl_old 215 64 epoll_wait_old -216 common remap_file_pages __x64_sys_remap_file_pages -217 common getdents64 __x64_sys_getdents64 -218 common set_tid_address __x64_sys_set_tid_address -219 common restart_syscall __x64_sys_restart_syscall -220 common semtimedop __x64_sys_semtimedop -221 common fadvise64 __x64_sys_fadvise64 -222 64 timer_create __x64_sys_timer_create -223 common timer_settime __x64_sys_timer_settime -224 common timer_gettime __x64_sys_timer_gettime -225 common timer_getoverrun __x64_sys_timer_getoverrun -226 common timer_delete __x64_sys_timer_delete -227 common clock_settime __x64_sys_clock_settime -228 common clock_gettime __x64_sys_clock_gettime -229 common clock_getres __x64_sys_clock_getres -230 common clock_nanosleep __x64_sys_clock_nanosleep -231 common exit_group __x64_sys_exit_group -232 common epoll_wait __x64_sys_epoll_wait -233 common epoll_ctl __x64_sys_epoll_ctl -234 common tgkill __x64_sys_tgkill -235 common utimes __x64_sys_utimes +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall sys_restart_syscall +220 common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes 236 64 vserver -237 common mbind __x64_sys_mbind -238 common set_mempolicy __x64_sys_set_mempolicy -239 common get_mempolicy __x64_sys_get_mempolicy -240 common mq_open __x64_sys_mq_open -241 common mq_unlink __x64_sys_mq_unlink -242 common mq_timedsend __x64_sys_mq_timedsend -243 common mq_timedreceive __x64_sys_mq_timedreceive -244 64 mq_notify __x64_sys_mq_notify -245 common mq_getsetattr __x64_sys_mq_getsetattr -246 64 kexec_load __x64_sys_kexec_load -247 64 waitid __x64_sys_waitid -248 common add_key __x64_sys_add_key -249 common request_key __x64_sys_request_key -250 common keyctl __x64_sys_keyctl -251 common ioprio_set __x64_sys_ioprio_set -252 common ioprio_get __x64_sys_ioprio_get -253 common inotify_init __x64_sys_inotify_init -254 common inotify_add_watch __x64_sys_inotify_add_watch -255 common inotify_rm_watch __x64_sys_inotify_rm_watch -256 common migrate_pages __x64_sys_migrate_pages -257 common openat __x64_sys_openat -258 common mkdirat __x64_sys_mkdirat -259 common mknodat __x64_sys_mknodat -260 common fchownat __x64_sys_fchownat -261 common futimesat __x64_sys_futimesat -262 common newfstatat __x64_sys_newfstatat -263 common unlinkat __x64_sys_unlinkat -264 common renameat __x64_sys_renameat -265 common linkat __x64_sys_linkat -266 common symlinkat __x64_sys_symlinkat -267 common readlinkat __x64_sys_readlinkat -268 common fchmodat __x64_sys_fchmodat -269 common faccessat __x64_sys_faccessat -270 common pselect6 __x64_sys_pselect6 -271 common ppoll __x64_sys_ppoll -272 common unshare __x64_sys_unshare -273 64 set_robust_list __x64_sys_set_robust_list -274 64 get_robust_list __x64_sys_get_robust_list -275 common splice __x64_sys_splice -276 common tee __x64_sys_tee -277 common sync_file_range __x64_sys_sync_file_range -278 64 vmsplice __x64_sys_vmsplice -279 64 move_pages __x64_sys_move_pages -280 common utimensat __x64_sys_utimensat -281 common epoll_pwait __x64_sys_epoll_pwait -282 common signalfd __x64_sys_signalfd -283 common timerfd_create __x64_sys_timerfd_create -284 common eventfd __x64_sys_eventfd -285 common fallocate __x64_sys_fallocate -286 common timerfd_settime __x64_sys_timerfd_settime -287 common timerfd_gettime __x64_sys_timerfd_gettime -288 common accept4 __x64_sys_accept4 -289 common signalfd4 __x64_sys_signalfd4 -290 common eventfd2 __x64_sys_eventfd2 -291 common epoll_create1 __x64_sys_epoll_create1 -292 common dup3 __x64_sys_dup3 -293 common pipe2 __x64_sys_pipe2 -294 common inotify_init1 __x64_sys_inotify_init1 -295 64 preadv __x64_sys_preadv -296 64 pwritev __x64_sys_pwritev -297 64 rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo -298 common perf_event_open __x64_sys_perf_event_open -299 64 recvmmsg __x64_sys_recvmmsg -300 common fanotify_init __x64_sys_fanotify_init -301 common fanotify_mark __x64_sys_fanotify_mark -302 common prlimit64 __x64_sys_prlimit64 -303 common name_to_handle_at __x64_sys_name_to_handle_at -304 common open_by_handle_at __x64_sys_open_by_handle_at -305 common clock_adjtime __x64_sys_clock_adjtime -306 common syncfs __x64_sys_syncfs -307 64 sendmmsg __x64_sys_sendmmsg -308 common setns __x64_sys_setns -309 common getcpu __x64_sys_getcpu -310 64 process_vm_readv __x64_sys_process_vm_readv -311 64 process_vm_writev __x64_sys_process_vm_writev -312 common kcmp __x64_sys_kcmp -313 common finit_module __x64_sys_finit_module -314 common sched_setattr __x64_sys_sched_setattr -315 common sched_getattr __x64_sys_sched_getattr -316 common renameat2 __x64_sys_renameat2 -317 common seccomp __x64_sys_seccomp -318 common getrandom __x64_sys_getrandom -319 common memfd_create __x64_sys_memfd_create -320 common kexec_file_load __x64_sys_kexec_file_load -321 common bpf __x64_sys_bpf -322 64 execveat __x64_sys_execveat/ptregs -323 common userfaultfd __x64_sys_userfaultfd -324 common membarrier __x64_sys_membarrier -325 common mlock2 __x64_sys_mlock2 -326 common copy_file_range __x64_sys_copy_file_range -327 64 preadv2 __x64_sys_preadv2 -328 64 pwritev2 __x64_sys_pwritev2 -329 common pkey_mprotect __x64_sys_pkey_mprotect -330 common pkey_alloc __x64_sys_pkey_alloc -331 common pkey_free __x64_sys_pkey_free -332 common statx __x64_sys_statx -333 common io_pgetevents __x64_sys_io_pgetevents -334 common rseq __x64_sys_rseq +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 common mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 common inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 common perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 common setns sys_setns +309 common getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev +312 common kcmp sys_kcmp +313 common finit_module sys_finit_module +314 common sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf +322 64 execveat sys_execveat +323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 +326 common copy_file_range sys_copy_file_range +327 64 preadv2 sys_preadv2 +328 64 pwritev2 sys_pwritev2 +329 common pkey_mprotect sys_pkey_mprotect +330 common pkey_alloc sys_pkey_alloc +331 common pkey_free sys_pkey_free +332 common statx sys_statx +333 common io_pgetevents sys_io_pgetevents +334 common rseq sys_rseq # don't use numbers 387 through 423, add new calls after the last # 'common' entry -424 common pidfd_send_signal __x64_sys_pidfd_send_signal -425 common io_uring_setup __x64_sys_io_uring_setup -426 common io_uring_enter __x64_sys_io_uring_enter -427 common io_uring_register __x64_sys_io_uring_register -428 common open_tree __x64_sys_open_tree -429 common move_mount __x64_sys_move_mount -430 common fsopen __x64_sys_fsopen -431 common fsconfig __x64_sys_fsconfig -432 common fsmount __x64_sys_fsmount -433 common fspick __x64_sys_fspick -434 common pidfd_open __x64_sys_pidfd_open -435 common clone3 __x64_sys_clone3/ptregs -437 common openat2 __x64_sys_openat2 -438 common pidfd_getfd __x64_sys_pidfd_getfd +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 +437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd # # x32-specific system call numbers start at 512 to avoid cache impact @@ -366,39 +366,39 @@ # on-the-fly for compat_sys_*() compatibility system calls if X86_X32 # is defined. # -512 x32 rt_sigaction __x32_compat_sys_rt_sigaction -513 x32 rt_sigreturn sys32_x32_rt_sigreturn -514 x32 ioctl __x32_compat_sys_ioctl -515 x32 readv __x32_compat_sys_readv -516 x32 writev __x32_compat_sys_writev -517 x32 recvfrom __x32_compat_sys_recvfrom -518 x32 sendmsg __x32_compat_sys_sendmsg -519 x32 recvmsg __x32_compat_sys_recvmsg -520 x32 execve __x32_compat_sys_execve/ptregs -521 x32 ptrace __x32_compat_sys_ptrace -522 x32 rt_sigpending __x32_compat_sys_rt_sigpending -523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait_time64 -524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo -525 x32 sigaltstack __x32_compat_sys_sigaltstack -526 x32 timer_create __x32_compat_sys_timer_create -527 x32 mq_notify __x32_compat_sys_mq_notify -528 x32 kexec_load __x32_compat_sys_kexec_load -529 x32 waitid __x32_compat_sys_waitid -530 x32 set_robust_list __x32_compat_sys_set_robust_list -531 x32 get_robust_list __x32_compat_sys_get_robust_list -532 x32 vmsplice __x32_compat_sys_vmsplice -533 x32 move_pages __x32_compat_sys_move_pages -534 x32 preadv __x32_compat_sys_preadv64 -535 x32 pwritev __x32_compat_sys_pwritev64 -536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo -537 x32 recvmmsg __x32_compat_sys_recvmmsg_time64 -538 x32 sendmmsg __x32_compat_sys_sendmmsg -539 x32 process_vm_readv __x32_compat_sys_process_vm_readv -540 x32 process_vm_writev __x32_compat_sys_process_vm_writev -541 x32 setsockopt __x32_compat_sys_setsockopt -542 x32 getsockopt __x32_compat_sys_getsockopt -543 x32 io_setup __x32_compat_sys_io_setup -544 x32 io_submit __x32_compat_sys_io_submit -545 x32 execveat __x32_compat_sys_execveat/ptregs -546 x32 preadv2 __x32_compat_sys_preadv64v2 -547 x32 pwritev2 __x32_compat_sys_pwritev64v2 +512 x32 rt_sigaction compat_sys_rt_sigaction +513 x32 rt_sigreturn compat_sys_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve compat_sys_execve +521 x32 ptrace compat_sys_ptrace +522 x32 rt_sigpending compat_sys_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo +525 x32 sigaltstack compat_sys_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg_time64 +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit +545 x32 execveat compat_sys_execveat +546 x32 preadv2 compat_sys_preadv64v2 +547 x32 pwritev2 compat_sys_pwritev64v2 diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh index 12fbbcfe7ef3..cc1e63857427 100644 --- a/arch/x86/entry/syscalls/syscallhdr.sh +++ b/arch/x86/entry/syscalls/syscallhdr.sh @@ -15,14 +15,21 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( echo "#define ${fileguard} 1" echo "" + max=0 while read nr abi name entry ; do if [ -z "$offset" ]; then echo "#define __NR_${prefix}${name} $nr" else echo "#define __NR_${prefix}${name} ($offset + $nr)" fi + + max=$nr done echo "" + echo "#ifdef __KERNEL__" + echo "#define __NR_${prefix}syscall_max $max" + echo "#endif" + echo "" echo "#endif /* ${fileguard} */" ) > "$out" diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 1af2be39e7d9..929bde120d6b 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -9,15 +9,7 @@ syscall_macro() { local nr="$2" local entry="$3" - # Entry can be either just a function name or "function/qualifier" - real_entry="${entry%%/*}" - if [ "$entry" = "$real_entry" ]; then - qualifier= - else - qualifier=${entry#*/} - fi - - echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" + echo "__SYSCALL_${abi}($nr, $entry)" } emit() { @@ -25,27 +17,15 @@ emit() { local nr="$2" local entry="$3" local compat="$4" - local umlentry="" if [ "$abi" != "I386" -a -n "$compat" ]; then echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 exit 1 fi - # For CONFIG_UML, we need to strip the __x64_sys prefix - if [ "$abi" = "64" -a "${entry}" != "${entry#__x64_sys}" ]; then - umlentry="sys${entry#__x64_sys}" - fi - if [ -z "$compat" ]; then - if [ -n "$entry" -a -z "$umlentry" ]; then - syscall_macro "$abi" "$nr" "$entry" - elif [ -n "$umlentry" ]; then # implies -n "$entry" - echo "#ifdef CONFIG_X86" + if [ -n "$entry" ]; then syscall_macro "$abi" "$nr" "$entry" - echo "#else /* CONFIG_UML */" - syscall_macro "$abi" "$nr" "$umlentry" - echo "#endif" fi else echo "#ifdef CONFIG_X86_32" @@ -61,24 +41,6 @@ emit() { grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then - emit 64 "$nr" "$entry" "$compat" - if [ "$abi" = "COMMON" ]; then - # COMMON means that this syscall exists in the same form for - # 64-bit and X32. - echo "#ifdef CONFIG_X86_X32_ABI" - emit X32 "$nr" "$entry" "$compat" - echo "#endif" - fi - elif [ "$abi" = "X32" ]; then - echo "#ifdef CONFIG_X86_X32_ABI" - emit X32 "$nr" "$entry" "$compat" - echo "#endif" - elif [ "$abi" = "I386" ]; then - emit "$abi" "$nr" "$entry" "$compat" - else - echo "Unknown abi $abi" >&2 - exit 1 - fi + emit "$abi" "$nr" "$entry" "$compat" done ) > "$out" diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index e010d4ae11f1..3a07ce3ec70b 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -35,9 +35,9 @@ SYM_CODE_END(\name) #endif #ifdef CONFIG_PREEMPTION - THUNK ___preempt_schedule, preempt_schedule - THUNK ___preempt_schedule_notrace, preempt_schedule_notrace - EXPORT_SYMBOL(___preempt_schedule) - EXPORT_SYMBOL(___preempt_schedule_notrace) + THUNK preempt_schedule_thunk, preempt_schedule + THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace + EXPORT_SYMBOL(preempt_schedule_thunk) + EXPORT_SYMBOL(preempt_schedule_notrace_thunk) #endif diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index c5c3b6e86e62..dbe4493b534e 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -47,10 +47,10 @@ SYM_FUNC_END(\name) #endif #ifdef CONFIG_PREEMPTION - THUNK ___preempt_schedule, preempt_schedule - THUNK ___preempt_schedule_notrace, preempt_schedule_notrace - EXPORT_SYMBOL(___preempt_schedule) - EXPORT_SYMBOL(___preempt_schedule_notrace) + THUNK preempt_schedule_thunk, preempt_schedule + THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace + EXPORT_SYMBOL(preempt_schedule_thunk) + EXPORT_SYMBOL(preempt_schedule_notrace_thunk) #endif #if defined(CONFIG_TRACE_IRQFLAGS) \ diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index ea7e0155c604..4d152933547d 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -57,6 +57,13 @@ SECTIONS *(.gnu.linkonce.b.*) } :text + /* + * Discard .note.gnu.property sections which are unused and have + * different alignment requirement from vDSO note sections. + */ + /DISCARD/ : { + *(.note.gnu.property) + } .note : { *(.note.*) } :text :note .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 9242b28418d5..1e82bd43286c 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -13,6 +13,7 @@ */ #undef CONFIG_64BIT #undef CONFIG_X86_64 +#undef CONFIG_COMPAT #undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index c1b8496b5606..43428cc514c8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -38,6 +38,8 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page) } #undef EMIT_VVAR +unsigned int vclocks_used __read_mostly; + #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; #endif @@ -219,7 +221,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va(); - if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { + if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) { return vmf_insert_pfn_prot(vma, vmf->address, __pa(pvti) >> PAGE_SHIFT, pgprot_decrypted(vma->vm_page_prot)); @@ -227,7 +229,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, } else if (sym_offset == image->sym_hvclock_page) { struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); - if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) + if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, virt_to_phys(tsc_pg) >> PAGE_SHIFT); } else if (sym_offset == image->sym_timens_page) { @@ -445,6 +447,8 @@ __setup("vdso=", vdso_setup); static int __init init_vdso(void) { + BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32); + init_vdso_image(&vdso_image_64); #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 1f22b6bbda68..39eb276d0277 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -250,6 +250,7 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index abef51320e3a..43b09e9c93a2 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -259,7 +259,7 @@ static int power_cpu_init(unsigned int cpu) } static const struct x86_cpu_id cpu_match[] = { - { .vendor = X86_VENDOR_AMD, .family = 0x15 }, + X86_MATCH_VENDOR_FAM(AMD, 0x15, NULL), {}, }; diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index a6ea07f2aa84..76400c052b0e 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -180,6 +180,31 @@ static void amd_uncore_del(struct perf_event *event, int flags) hwc->idx = -1; } +/* + * Convert logical CPU number to L3 PMC Config ThreadMask format + */ +static u64 l3_thread_slice_mask(int cpu) +{ + u64 thread_mask, core = topology_core_id(cpu); + unsigned int shift, thread = 0; + + if (topology_smt_supported() && !topology_is_primary_thread(cpu)) + thread = 1; + + if (boot_cpu_data.x86 <= 0x18) { + shift = AMD64_L3_THREAD_SHIFT + 2 * (core % 4) + thread; + thread_mask = BIT_ULL(shift); + + return AMD64_L3_SLICE_MASK | thread_mask; + } + + core = (core << AMD64_L3_COREID_SHIFT) & AMD64_L3_COREID_MASK; + shift = AMD64_L3_THREAD_SHIFT + thread; + thread_mask = BIT_ULL(shift); + + return AMD64_L3_EN_ALL_SLICES | core | thread_mask; +} + static int amd_uncore_event_init(struct perf_event *event) { struct amd_uncore *uncore; @@ -190,15 +215,12 @@ static int amd_uncore_event_init(struct perf_event *event) /* * NB and Last level cache counters (MSRs) are shared across all cores - * that share the same NB / Last level cache. Interrupts can be directed - * to a single target core, however, event counts generated by processes - * running on other cores cannot be masked out. So we do not support - * sampling and per-thread events. + * that share the same NB / Last level cache. On family 16h and below, + * Interrupts can be directed to a single target core, however, event + * counts generated by processes running on other cores cannot be masked + * out. So we do not support sampling and per-thread events via + * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts: */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; - - /* and we do not enable counter overflow interrupts */ hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; @@ -206,18 +228,11 @@ static int amd_uncore_event_init(struct perf_event *event) return -EINVAL; /* - * SliceMask and ThreadMask need to be set for certain L3 events in - * Family 17h. For other events, the two fields do not affect the count. + * SliceMask and ThreadMask need to be set for certain L3 events. + * For other events, the two fields do not affect the count. */ - if (l3_mask && is_llc_event(event)) { - int thread = 2 * (cpu_data(event->cpu).cpu_core_id % 4); - - if (smp_num_siblings > 1) - thread += cpu_data(event->cpu).apicid & 1; - - hwc->config |= (1ULL << (AMD64_L3_THREAD_SHIFT + thread) & - AMD64_L3_THREAD_MASK) | AMD64_L3_SLICE_MASK; - } + if (l3_mask && is_llc_event(event)) + hwc->config |= l3_thread_slice_mask(event->cpu); uncore = event_to_amd_uncore(event); if (!uncore) @@ -306,7 +321,7 @@ static struct pmu amd_nb_pmu = { .start = amd_uncore_start, .stop = amd_uncore_stop, .read = amd_uncore_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, }; static struct pmu amd_llc_pmu = { @@ -317,7 +332,7 @@ static struct pmu amd_llc_pmu = { .start = amd_uncore_start, .stop = amd_uncore_stop, .read = amd_uncore_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, }; static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) @@ -523,9 +538,9 @@ static int __init amd_uncore_init(void) if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) return -ENODEV; - if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) { + if (boot_cpu_data.x86 >= 0x17) { /* - * For F17h or F18h, the Northbridge counters are + * For F17h and above, the Northbridge counters are * repurposed as Data Fabric counters. Also, L3 * counters are supported too. The PMUs are exported * based on family as either L2 or L3 and NB or DF. diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3bb738f5a472..a619763e96e1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2490,7 +2490,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; - const void __user *fp; + const struct stack_frame_ia32 __user *fp; if (!test_thread_flag(TIF_IA32)) return 0; @@ -2501,18 +2501,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - frame.next_frame = 0; - frame.return_address = 0; - if (!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); @@ -2533,7 +2527,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; - const unsigned long __user *fp; + const struct stack_frame __user *fp; if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2546,7 +2540,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; - fp = (unsigned long __user *)regs->bp; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -2558,19 +2552,12 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - - frame.next_frame = NULL; - frame.return_address = 0; - if (!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3be51aa06e67..332954cccece 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1945,6 +1945,14 @@ static __initconst const u64 knl_hw_cache_extra_regs * intel_bts events don't coexist with intel PMU's BTS events because of * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them * disabled around intel PMU's event batching etc, only inside the PMI handler. + * + * Avoid PEBS_ENABLE MSR access in PMIs. + * The GLOBAL_CTRL has been disabled. All the counters do not count anymore. + * It doesn't matter if the PEBS is enabled or not. + * Usually, the PEBS status are not changed in PMIs. It's unnecessary to + * access PEBS_ENABLE MSR in disable_all()/enable_all(). + * However, there are some cases which may change PEBS status, e.g. PMI + * throttle. The PEBS_ENABLE should be updated where the status changes. */ static void __intel_pmu_disable_all(void) { @@ -1954,13 +1962,12 @@ static void __intel_pmu_disable_all(void) if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); - - intel_pmu_pebs_disable_all(); } static void intel_pmu_disable_all(void) { __intel_pmu_disable_all(); + intel_pmu_pebs_disable_all(); intel_pmu_lbr_disable_all(); } @@ -1968,7 +1975,6 @@ static void __intel_pmu_enable_all(int added, bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - intel_pmu_pebs_enable_all(); intel_pmu_lbr_enable_all(pmi); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); @@ -1986,6 +1992,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) static void intel_pmu_enable_all(int added) { + intel_pmu_pebs_enable_all(); __intel_pmu_enable_all(added, false); } @@ -2374,9 +2381,21 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * PEBS overflow sets bit 62 in the global status register */ if (__test_and_clear_bit(62, (unsigned long *)&status)) { + u64 pebs_enabled = cpuc->pebs_enabled; + handled++; x86_pmu.drain_pebs(regs); status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; + + /* + * PMI throttle may be triggered, which stops the PEBS event. + * Although cpuc->pebs_enabled is updated accordingly, the + * MSR_IA32_PEBS_ENABLE is not updated. Because the + * cpuc->enabled has been forced to 0 in PMI. + * Update the MSR if pebs_enabled is changed. + */ + if (pebs_enabled != cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); } /* @@ -4765,6 +4784,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_TREMONT_D: + case INTEL_FAM6_ATOM_TREMONT: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index e1daf4151e11..e4aa20c0426f 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,17 +40,18 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT,GLM,CNL + * Available model: SLM,AMT,GLM,CNL,TNT * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM, - * CNL,KBL,CML + * CNL,KBL,CML,TNT * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, + * TNT * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -60,17 +61,18 @@ * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL + * KBL,CML,ICL,TGL,TNT * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL + * GLM,CNL,KBL,CML,ICL,TGL,TNT * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, + * TNT * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -87,7 +89,8 @@ * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, + * TNT * Scope: Package (physical package) * */ @@ -591,62 +594,60 @@ static const struct cstate_model glm_cstates __initconst = { }; -#define X86_CSTATES_MODEL(model, states) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } - static const struct x86_cpu_id intel_cstates_match[] __initconst = { - X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EP, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EX, nhm_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EP, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EX, nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE_X, snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_G, snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_L, hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_D, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_D, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_G, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &slm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_D, &slm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_L, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_L, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE_L, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE, hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_L, cnl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &cnl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_D, glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE_L, icl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE, icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4b94ae4ae369..dc43cc124e09 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1714,6 +1714,8 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) old = ((s64)(prev_raw_count << shift) >> shift); local64_add(new - old + count * period, &event->count); + local64_set(&hwc->period_left, -new); + perf_event_update_userpage(event); return 0; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 534c76606049..65113b16804a 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -585,6 +585,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.hw_idx = tos; } /* @@ -680,6 +681,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) out++; } cpuc->lbr_stack.nr = out; + cpuc->lbr_stack.hw_idx = tos; } void intel_pmu_lbr_read(void) @@ -1120,6 +1122,13 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) int i; cpuc->lbr_stack.nr = x86_pmu.lbr_nr; + + /* Cannot get TOS for large PEBS */ + if (cpuc->n_pebs == cpuc->n_large_pebs) + cpuc->lbr_stack.hw_idx = -1ULL; + else + cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { u64 info = lbr->lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 09913121e726..a5dbd25852cb 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -668,9 +668,6 @@ static int __init init_rapl_pmus(void) return 0; } -#define X86_RAPL_MODEL_MATCH(model, init) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } - static struct rapl_model model_snb = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | @@ -716,36 +713,35 @@ static struct rapl_model model_skl = { }; static const struct x86_cpu_id rapl_model_match[] __initconst = { - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, model_snb), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, model_snb), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, model_snbep), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, model_knl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, model_knl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE, model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &model_snbep), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &model_knl), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &model_knl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), {}, }; - MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 86467f85c383..1ba72c563313 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1392,10 +1392,6 @@ err: return ret; } - -#define X86_UNCORE_MODEL_MATCH(model, init) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } - struct intel_uncore_init_fun { void (*cpu_init)(void); int (*pci_init)(void); @@ -1470,6 +1466,16 @@ static const struct intel_uncore_init_fun icl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun tgl_uncore_init __initconst = { + .cpu_init = icl_uncore_cpu_init, + .mmio_init = tgl_uncore_mmio_init, +}; + +static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { + .cpu_init = icl_uncore_cpu_init, + .mmio_init = tgl_l_uncore_mmio_init, +}; + static const struct intel_uncore_init_fun snr_uncore_init __initconst = { .cpu_init = snr_uncore_cpu_init, .pci_init = snr_uncore_pci_init, @@ -1477,38 +1483,39 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, ivb_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_L, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_G, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL, bdw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, bdw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX, nhmex_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX, nhmex_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, ivbep_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hswep_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, bdx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_D, snr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &ivb_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &hsw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hsw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &hsw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &bdw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &bdw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snbep_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhmex_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhmex_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ivbep_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &hswep_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &bdx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &bdx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, }; - MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); static int __init intel_uncore_init(void) diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index bbfdaa720b45..b30429f8a53a 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -154,6 +154,7 @@ struct freerunning_counters { unsigned int box_offset; unsigned int num_counters; unsigned int bits; + unsigned *box_offsets; }; struct pci2phy_map { @@ -310,7 +311,9 @@ unsigned int uncore_freerunning_counter(struct intel_uncore_box *box, return pmu->type->freerunning[type].counter_base + pmu->type->freerunning[type].counter_offset * idx + - pmu->type->freerunning[type].box_offset * pmu->pmu_idx; + (pmu->type->freerunning[type].box_offsets ? + pmu->type->freerunning[type].box_offsets[pmu->pmu_idx] : + pmu->type->freerunning[type].box_offset * pmu->pmu_idx); } static inline @@ -527,6 +530,8 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); +void tgl_uncore_mmio_init(void); +void tgl_l_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index c37cb12d0ef6..3de1065eefc4 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -44,6 +44,11 @@ #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 +#define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02 +#define PCI_DEVICE_ID_INTEL_TGL_U2_IMC 0x9a04 +#define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12 +#define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14 +#define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36 /* SNB event control */ @@ -1002,3 +1007,157 @@ void nhm_uncore_cpu_init(void) } /* end of Nehalem uncore support */ + +/* Tiger Lake MMIO uncore support */ + +static const struct pci_device_id tgl_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U4_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ } +}; + +enum perf_tgl_uncore_imc_freerunning_types { + TGL_MMIO_UNCORE_IMC_DATA_TOTAL, + TGL_MMIO_UNCORE_IMC_DATA_READ, + TGL_MMIO_UNCORE_IMC_DATA_WRITE, + TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX +}; + +static struct freerunning_counters tgl_l_uncore_imc_freerunning[] = { + [TGL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x5040, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_READ] = { 0x5058, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0x50A0, 0x0, 0x0, 1, 64 }, +}; + +static struct freerunning_counters tgl_uncore_imc_freerunning[] = { + [TGL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0xd840, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_READ] = { 0xd858, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xd8A0, 0x0, 0x0, 1, 64 }, +}; + +static struct uncore_event_desc tgl_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_total, "event=0xff,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(data_total.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_total.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_read, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(data_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_read.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_write, "event=0xff,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(data_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_write.unit, "MiB"), + + { /* end: all zeroes */ } +}; + +static struct pci_dev *tgl_uncore_get_mc_dev(void) +{ + const struct pci_device_id *ids = tgl_uncore_pci_ids; + struct pci_dev *mc_dev = NULL; + + while (ids && ids->vendor) { + mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, ids->device, NULL); + if (mc_dev) + return mc_dev; + ids++; + } + + return mc_dev; +} + +#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 + +static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = tgl_uncore_get_mc_dev(); + struct intel_uncore_pmu *pmu = box->pmu; + resource_size_t addr; + u32 mch_bar; + + if (!pdev) { + pr_warn("perf uncore: Cannot find matched IMC device.\n"); + return; + } + + pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET, &mch_bar); + /* MCHBAR is disabled */ + if (!(mch_bar & BIT(0))) { + pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n"); + return; + } + mch_bar &= ~BIT(0); + addr = (resource_size_t)(mch_bar + TGL_UNCORE_MMIO_IMC_MEM_OFFSET * pmu->pmu_idx); + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET + 4, &mch_bar); + addr |= ((resource_size_t)mch_bar << 32); +#endif + + box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); +} + +static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { + .init_box = tgl_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct attribute *tgl_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL +}; + +static const struct attribute_group tgl_uncore_imc_format_group = { + .name = "format", + .attrs = tgl_uncore_imc_formats_attr, +}; + +static struct intel_uncore_type tgl_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .num_boxes = 2, + .num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .freerunning = tgl_uncore_imc_freerunning, + .ops = &tgl_uncore_imc_freerunning_ops, + .event_descs = tgl_uncore_imc_events, + .format_group = &tgl_uncore_imc_format_group, +}; + +static struct intel_uncore_type *tgl_mmio_uncores[] = { + &tgl_uncore_imc_free_running, + NULL +}; + +void tgl_l_uncore_mmio_init(void) +{ + tgl_uncore_imc_free_running.freerunning = tgl_l_uncore_imc_freerunning; + uncore_mmio_uncores = tgl_mmio_uncores; +} + +void tgl_uncore_mmio_init(void) +{ + uncore_mmio_uncores = tgl_mmio_uncores; +} + +/* end of Tiger Lake MMIO uncore support */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ad20220af303..01023f0d935b 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4380,10 +4380,10 @@ static struct pci_dev *snr_uncore_get_mc_dev(int id) return mc_dev; } -static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) +static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, + unsigned int box_ctl, int mem_offset) { struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); - unsigned int box_ctl = uncore_mmio_box_ctl(box); resource_size_t addr; u32 pci_dword; @@ -4393,7 +4393,7 @@ static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; - pci_read_config_dword(pdev, SNR_IMC_MMIO_MEM0_OFFSET, &pci_dword); + pci_read_config_dword(pdev, mem_offset, &pci_dword); addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12; addr += box_ctl; @@ -4405,6 +4405,12 @@ static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } +static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), + SNR_IMC_MMIO_MEM0_OFFSET); +} + static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box) { u32 config; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 6f86650b3f77..a949f6f55991 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -75,8 +75,9 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_D: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + case INTEL_FAM6_ATOM_TREMONT_D: + case INTEL_FAM6_ATOM_TREMONT: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index d13b352b2aa7..8e4d0391ff6c 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a3aefe9b9401..f9d8804144d0 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -36,70 +36,56 @@ #include <asm/sighandling.h> #include <asm/smap.h> +static inline void reload_segments(struct sigcontext_32 *sc) +{ + unsigned int cur; + + savesegment(gs, cur); + if ((sc->gs | 0x03) != cur) + load_gs_index(sc->gs | 0x03); + savesegment(fs, cur); + if ((sc->fs | 0x03) != cur) + loadsegment(fs, sc->fs | 0x03); + savesegment(ds, cur); + if ((sc->ds | 0x03) != cur) + loadsegment(ds, sc->ds | 0x03); + savesegment(es, cur); + if ((sc->es | 0x03) != cur) + loadsegment(es, sc->es | 0x03); +} + /* * Do a signal return; undo the signal stack. */ -#define loadsegment_gs(v) load_gs_index(v) -#define loadsegment_fs(v) loadsegment(fs, v) -#define loadsegment_ds(v) loadsegment(ds, v) -#define loadsegment_es(v) loadsegment(es, v) - -#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) -#define set_user_seg(seg, v) loadsegment_##seg(v) - -#define COPY(x) { \ - get_user_ex(regs->x, &sc->x); \ -} - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - -#define RELOAD_SEG(seg) { \ - unsigned int pre = (seg) | 3; \ - unsigned int cur = get_user_seg(seg); \ - if (pre != cur) \ - set_user_seg(seg, pre); \ -} - static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_32 __user *sc) + struct sigcontext_32 __user *usc) { - unsigned int tmpflags, err = 0; - u16 gs, fs, es, ds; - void __user *buf; - u32 tmp; + struct sigcontext_32 sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { - gs = GET_SEG(gs); - fs = GET_SEG(fs); - ds = GET_SEG(ds); - es = GET_SEG(es); - - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); - /* Don't touch extended registers */ - - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); - - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - /* disable syscall checks */ - regs->orig_ax = -1; + if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) + return -EFAULT; - get_user_ex(tmp, &sc->fpstate); - buf = compat_ptr(tmp); - } get_user_catch(err); + /* Get only the ia32 registers. */ + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; /* * Reload fs and gs if they have changed in the signal @@ -107,14 +93,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, * the handler, but does not clobber them at least in the * normal case. */ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); - - err |= fpu__restore_sig(buf, 1); - - return err; + reload_segments(&sc); + return fpu__restore_sig(compat_ptr(sc.fpstate), 1); } COMPAT_SYSCALL_DEFINE0(sigreturn) @@ -126,10 +106,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) - || (_COMPAT_NSIG_WORDS > 1 - && __copy_from_user((((char *) &set.sig) + 4), - &frame->extramask, - sizeof(frame->extramask)))) + || __get_user(((__u32 *)&set)[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); @@ -153,7 +130,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; set_current_blocked(&set); @@ -175,44 +152,51 @@ badframe: * Set up a signal frame. */ -static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, - void __user *fpstate, - struct pt_regs *regs, unsigned int mask) +#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) + +static __always_inline int +__unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc, + void __user *fpstate, + struct pt_regs *regs, unsigned int mask) { - int err = 0; - - put_user_try { - put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs); - put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs); - put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds); - put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es); - - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); - - put_user_ex(ptr_to_compat(fpstate), &sc->fpstate); - - /* non-iBCS2 extensions.. */ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); - - return err; + unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); + unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault); + + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); + + unsafe_put_user(ptr_to_compat(fpstate), &sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; + +Efault: + return -EFAULT; } +#define unsafe_put_sigcontext32(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext32(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0) + /* * Determine which stack to use.. */ @@ -252,8 +236,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, { struct sigframe_ia32 __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; /* copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -266,22 +249,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, 0x80cd, /* int $0x80 */ }; - frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - if (__put_user(sig, &frame->sig)) - return -EFAULT; - - if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) - return -EFAULT; - - if (_COMPAT_NSIG_WORDS > 1) { - if (__copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); if (ksig->ka.sa.sa_flags & SA_RESTORER) { restorer = ksig->ka.sa.sa_restorer; @@ -294,19 +262,20 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, restorer = &frame->retcode; } - put_user_try { - put_user_ex(ptr_to_compat(restorer), &frame->pretcode); - - /* - * These are actually not used anymore, but left because some - * gdb versions depend on them as a marker. - */ - put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); - } put_user_catch(err); - - if (err) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. + */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + user_access_end(); + /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; @@ -323,6 +292,9 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, regs->ss = __USER32_DS; return 0; +Efault: + user_access_end(); + return -EFAULT; } int ia32_setup_rt_frame(int sig, struct ksignal *ksig, @@ -330,10 +302,9 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe_ia32 __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - /* __copy_to_user optimizes that into a single 8 byte store */ + /* unsafe_put_user optimizes that into a single 8 byte store */ static const struct { u8 movl; u32 val; @@ -346,44 +317,40 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, 0, }; - frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo); - put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault); + unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - else - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - put_user_ex(ptr_to_compat(restorer), &frame->pretcode); - - /* - * Not actually used anymore, but left because some gdb - * versions need it. - */ - put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); - } put_user_catch(err); - - err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + + /* + * Not actually used anymore, but left because some gdb + * versions need it. + */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); + user_access_end(); + + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) return -EFAULT; /* Set up registers for signal handler */ @@ -402,4 +369,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, regs->ss = __USER32_DS; return 0; +Efault: + user_access_end(); + return -EFAULT; } diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 1ae4e5791afa..c7df20e78b09 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -12,7 +12,6 @@ struct amd_nb_bus_dev_range { u8 dev_limit; }; -extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; extern bool early_is_amd_nb(u32 value); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index cd339b88d5d4..0f63585edf5f 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -138,9 +138,6 @@ # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -166,9 +163,6 @@ # define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index dc4cfc888d6d..dc9dc7b3911a 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -4,14 +4,18 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */ -#define VCLOCK_HVCLOCK 3 /* vDSO should use vread_hvclock. */ -#define VCLOCK_MAX 3 +#include <asm/vdso/clocksource.h> -struct arch_clocksource_data { - int vclock_mode; -}; +extern unsigned int vclocks_used; + +static inline bool vclock_was_used(int vclock) +{ + return READ_ONCE(vclocks_used) & (1U << vclock); +} + +static inline void vclocks_set_used(unsigned int which) +{ + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << which)); +} #endif /* _ASM_X86_CLOCKSOURCE_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index adc6cc86b062..ff6f3ca649b3 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -40,4 +40,16 @@ int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); +#ifdef CONFIG_CPU_SUP_INTEL +extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c); +extern void switch_to_sld(unsigned long tifn); +extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); +#else +static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {} +static inline void switch_to_sld(unsigned long tifn) {} +static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + return false; +} +#endif #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index 31c379c1da41..cf3d621c6892 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -5,9 +5,139 @@ /* * Declare drivers belonging to specific x86 CPUs * Similar in spirit to pci_device_id and related PCI functions + * + * The wildcard initializers are in mod_devicetable.h because + * file2alias needs them. Sigh. */ - #include <linux/mod_devicetable.h> +/* Get the INTEL_FAM* model defines */ +#include <asm/intel-family.h> +/* And the X86_VENDOR_* ones */ +#include <asm/processor.h> + +/* Centaur FAM6 models */ +#define X86_CENTAUR_FAM6_C7_A 0xa +#define X86_CENTAUR_FAM6_C7_D 0xd +#define X86_CENTAUR_FAM6_NANO 0xf + +/** + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Base macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * Use only if you need all selectors. Otherwise use one of the shorter + * macros of the X86_MATCH_* family. If there is no matching shorthand + * macro, consider to add one. If you really need to wrap one of the macros + * into another macro at the usage site for good reasons, then please + * start this local macro with X86_MATCH to allow easy grepping. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(_vendor, _family, _model, \ + _feature, _data) { \ + .vendor = X86_VENDOR_##_vendor, \ + .family = _family, \ + .model = _model, \ + .feature = _feature, \ + .driver_data = (unsigned long) _data \ +} + +/** + * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @family: The family number or X86_FAMILY_ANY + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \ + X86_MODEL_ANY, feature, data) + +/** + * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ + X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data) + +/** + * X86_MATCH_FEATURE - Macro for matching a CPU feature + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_FEATURE(feature, data) \ + X86_MATCH_VENDOR_FEATURE(ANY, feature, data) + +/** + * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @family: The family number or X86_FAMILY_ANY + * @model: The model number, model constant or X86_MODEL_ANY + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \ + X86_FEATURE_ANY, data) + +/** + * X86_MATCH_VENDOR_FAM - Match vendor and family + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @family: The family number or X86_FAMILY_ANY + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set of wildcards. + */ +#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) + +/** + * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model + * @model: The model name without the INTEL_FAM6_ prefix or ANY + * The model name is expanded to INTEL_FAM6_@model internally + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * The vendor is set to INTEL, the family to 6 and all other missing + * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. + * + * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. + */ +#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ + X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) /* * Match specific microcode revisions. diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f3327cb56edf..db189945e9b0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -217,7 +217,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -285,6 +285,7 @@ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ @@ -299,6 +300,7 @@ #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ @@ -367,6 +369,7 @@ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ /* diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index ae391f609840..f71a0cce9373 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -42,8 +42,8 @@ * Emit CFI data in .debug_frame sections, not .eh_frame sections. * The latter we currently just discard since we don't do DWARF * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if - * vmlinux.lds.S gets changed so it doesn't discard .eh_frame. + * useful to anyone. Note we should not use this directive if we + * ever decide to enable DWARF unwinding at runtime. */ .cfi_sections .debug_frame #else diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 86169a24b0d8..cdcf48d52a12 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -10,6 +10,8 @@ #include <asm/mmu_context.h> #include <linux/build_bug.h> +extern unsigned long efi_fw_vendor, efi_config_table; + /* * We map the EFI regions needed for runtime services non-contiguously, * with preserved alignment on virtual addresses starting from -4G down @@ -34,8 +36,6 @@ static inline bool efi_have_uv1_memmap(void) #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" -#define MAX_CMDLINE_ADDRESS UINT_MAX - #define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF /* @@ -180,7 +180,6 @@ extern void __init efi_uv1_memmap_phys_epilog(pgd_t *save_pgd); struct efi_setup_data { u64 fw_vendor; - u64 runtime; u64 tables; u64 smbios; u64 reserved[8]; @@ -219,7 +218,8 @@ extern void efi_thunk_runtime_setup(void); efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, unsigned long descriptor_size, u32 descriptor_version, - efi_memory_desc_t *virtual_map); + efi_memory_desc_t *virtual_map, + unsigned long systab_phys); /* arch specific definitions used by the stub code */ @@ -270,6 +270,11 @@ static inline void *efi64_zero_upper(void *p) return p; } +static inline u32 efi64_convert_status(efi_status_t status) +{ + return (u32)(status | (u64)status >> 32); +} + #define __efi64_argmap_free_pages(addr, size) \ ((addr), 0, (size)) @@ -285,11 +290,21 @@ static inline void *efi64_zero_upper(void *p) #define __efi64_argmap_locate_protocol(protocol, reg, interface) \ ((protocol), (reg), efi64_zero_upper(interface)) +#define __efi64_argmap_locate_device_path(protocol, path, handle) \ + ((protocol), (path), efi64_zero_upper(handle)) + +#define __efi64_argmap_exit(handle, status, size, data) \ + ((handle), efi64_convert_status(status), (size), (data)) + /* PCI I/O */ #define __efi64_argmap_get_location(protocol, seg, bus, dev, func) \ ((protocol), efi64_zero_upper(seg), efi64_zero_upper(bus), \ efi64_zero_upper(dev), efi64_zero_upper(func)) +/* LoadFile */ +#define __efi64_argmap_load_file(protocol, path, policy, bufsize, buf) \ + ((protocol), (path), (policy), efi64_zero_upper(bufsize), (buf)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 13c83fe97988..f9c00110a69a 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -12,76 +12,103 @@ #include <asm/processor.h> #include <asm/smap.h> -#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ - asm volatile("\t" ASM_STAC "\n" \ - "1:\t" insn "\n" \ - "2:\t" ASM_CLAC "\n" \ +#define unsafe_atomic_op1(insn, oval, uaddr, oparg, label) \ +do { \ + int oldval = 0, ret; \ + asm volatile("1:\t" insn "\n" \ + "2:\n" \ "\t.section .fixup,\"ax\"\n" \ "3:\tmov\t%3, %1\n" \ "\tjmp\t2b\n" \ "\t.previous\n" \ _ASM_EXTABLE_UA(1b, 3b) \ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ - : "i" (-EFAULT), "0" (oparg), "1" (0)) + : "i" (-EFAULT), "0" (oparg), "1" (0)); \ + if (ret) \ + goto label; \ + *oval = oldval; \ +} while(0) -#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \ - asm volatile("\t" ASM_STAC "\n" \ - "1:\tmovl %2, %0\n" \ - "\tmovl\t%0, %3\n" \ + +#define unsafe_atomic_op2(insn, oval, uaddr, oparg, label) \ +do { \ + int oldval = 0, ret, tem; \ + asm volatile("1:\tmovl %2, %0\n" \ + "2:\tmovl\t%0, %3\n" \ "\t" insn "\n" \ - "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \ - "\tjnz\t1b\n" \ - "3:\t" ASM_CLAC "\n" \ + "3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \ + "\tjnz\t2b\n" \ + "4:\n" \ "\t.section .fixup,\"ax\"\n" \ - "4:\tmov\t%5, %1\n" \ - "\tjmp\t3b\n" \ + "5:\tmov\t%5, %1\n" \ + "\tjmp\t4b\n" \ "\t.previous\n" \ - _ASM_EXTABLE_UA(1b, 4b) \ - _ASM_EXTABLE_UA(2b, 4b) \ + _ASM_EXTABLE_UA(1b, 5b) \ + _ASM_EXTABLE_UA(3b, 5b) \ : "=&a" (oldval), "=&r" (ret), \ "+m" (*uaddr), "=&r" (tem) \ - : "r" (oparg), "i" (-EFAULT), "1" (0)) + : "r" (oparg), "i" (-EFAULT), "1" (0)); \ + if (ret) \ + goto label; \ + *oval = oldval; \ +} while(0) -static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, +static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int oldval = 0, ret, tem; - - pagefault_disable(); + if (!user_access_begin(uaddr, sizeof(u32))) + return -EFAULT; switch (op) { case FUTEX_OP_SET: - __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg); + unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault); break; case FUTEX_OP_ADD: - __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval, - uaddr, oparg); + unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, + uaddr, oparg, Efault); break; case FUTEX_OP_OR: - __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg); + unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault); break; case FUTEX_OP_ANDN: - __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg); + unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault); break; case FUTEX_OP_XOR: - __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg); + unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault); break; default: - ret = -ENOSYS; + user_access_end(); + return -ENOSYS; } - - pagefault_enable(); - - if (!ret) - *oval = oldval; - - return ret; + user_access_end(); + return 0; +Efault: + user_access_end(); + return -EFAULT; } static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - return user_atomic_cmpxchg_inatomic(uval, uaddr, oldval, newval); + int ret = 0; + + if (!user_access_begin(uaddr, sizeof(u32))) + return -EFAULT; + asm volatile("\n" + "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" + "2:\n" + "\t.section .fixup, \"ax\"\n" + "3:\tmov %3, %0\n" + "\tjmp 2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "+r" (ret), "=a" (oldval), "+m" (*uaddr) + : "i" (-EFAULT), "r" (newval), "1" (oldval) + : "memory" + ); + user_access_end(); + *uval = oldval; + return ret; } #endif diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 4981c293f926..8f1e94f29a16 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -35,6 +35,9 @@ * The #define line may optionally include a comment including platform names. */ +/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ +#define INTEL_FAM6_ANY X86_MODEL_ANY + #define INTEL_FAM6_CORE_YONAH 0x0E #define INTEL_FAM6_CORE2_MEROM 0x0F @@ -118,17 +121,7 @@ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ -/* Useful macros */ -#define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \ -{ \ - .vendor = X86_VENDOR_INTEL, \ - .family = _family, \ - .model = _model, \ - .feature = X86_FEATURE_ANY, \ - .driver_data = (kernel_ulong_t)&_driver_data \ -} - -#define INTEL_CPU_FAM6(_model, _driver_data) \ - INTEL_CPU_FAM_ANY(6, INTEL_FAM6_##_model, _driver_data) +/* Family 5 */ +#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index 02c6ef8f7667..07344d82e88e 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -19,7 +19,14 @@ struct task_struct; void io_bitmap_share(struct task_struct *tsk); void io_bitmap_exit(void); -void tss_update_io_bitmap(void); +void native_tss_update_io_bitmap(void); + +#ifdef CONFIG_PARAVIRT_XXL +#include <asm/paravirt.h> +#else +#define tss_update_io_bitmap native_tss_update_io_bitmap +#endif + #else static inline void io_bitmap_share(struct task_struct *tsk) { } static inline void io_bitmap_exit(void) { } diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index a176f6165d85..72fba0eeeb30 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,7 +36,7 @@ extern void native_init_IRQ(void); extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs); -extern __visible unsigned int do_IRQ(struct pt_regs *regs); +extern __visible void do_IRQ(struct pt_regs *regs); extern void init_ISA_irqs(void); diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 95b1f053bd96..073eb7ad2f56 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -36,6 +36,7 @@ typedef u8 kprobe_opcode_t; /* optinsn template addresses */ extern __visible kprobe_opcode_t optprobe_template_entry[]; +extern __visible kprobe_opcode_t optprobe_template_clac[]; extern __visible kprobe_opcode_t optprobe_template_val[]; extern __visible kprobe_opcode_t optprobe_template_call[]; extern __visible kprobe_opcode_t optprobe_template_end[]; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 03946eb3e2b9..c06e8353efd3 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -292,6 +292,14 @@ enum x86emul_mode { #define X86EMUL_SMM_MASK (1 << 6) #define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) +/* + * fastop functions are declared as taking a never-defined fastop parameter, + * so they can't be called from C directly. + */ +struct fastop; + +typedef void (*fastop_t)(struct fastop *); + struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; @@ -324,7 +332,10 @@ struct x86_emulate_ctxt { struct operand src; struct operand src2; struct operand dst; - int (*execute)(struct x86_emulate_ctxt *ctxt); + union { + int (*execute)(struct x86_emulate_ctxt *ctxt); + fastop_t fop; + }; int (*check_perm)(struct x86_emulate_ctxt *ctxt); /* * The following six fields are cleared together, @@ -349,7 +360,6 @@ struct x86_emulate_ctxt { u64 d; unsigned long _eip; struct operand memop; - /* Fields above regs are cleared together. */ unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; struct fetch_cache fetch; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4dffbc10d3f8..98959e8cd448 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -781,9 +781,19 @@ struct kvm_vcpu_arch { u64 msr_kvm_poll_control; /* - * Indicate whether the access faults on its page table in guest - * which is set when fix page fault and used to detect unhandeable - * instruction. + * Indicates the guest is trying to write a gfn that contains one or + * more of the PTEs used to translate the write itself, i.e. the access + * is changing its own translation in the guest page tables. KVM exits + * to userspace if emulation of the faulting instruction fails and this + * flag is set, as KVM cannot make forward progress. + * + * If emulation fails for a write to guest page tables, KVM unprotects + * (zaps) the shadow page for the target gfn and resumes the guest to + * retry the non-emulatable instruction (on hardware). Unprotecting the + * gfn doesn't allow forward progress for a self-changing access because + * doing so also zaps the translation for the gfn, i.e. retrying the + * instruction will hit a !PRESENT fault, which results in a new shadow + * page and sends KVM back to square one. */ bool write_fault_to_shadow_pgtable; @@ -1112,6 +1122,7 @@ struct kvm_x86_ops { int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*update_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, @@ -1136,7 +1147,7 @@ struct kvm_x86_ops { void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); - void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4359b955e0b7..f9cea081c05b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -102,7 +102,7 @@ #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ -#define MCE_LOG_LEN 32 +#define MCE_LOG_MIN_LEN 32U #define MCE_LOG_SIGNATURE "MACHINECHECK" /* AMD Scalable MCA */ @@ -135,11 +135,11 @@ */ struct mce_log_buffer { char signature[12]; /* "MACHINECHECK" */ - unsigned len; /* = MCE_LOG_LEN */ + unsigned len; /* = elements in .mce_entry[] */ unsigned next; unsigned flags; unsigned recordlen; /* length of struct mce */ - struct mce entry[MCE_LOG_LEN]; + struct mce entry[]; }; enum mce_notifier_prios { @@ -238,9 +238,6 @@ extern void mce_disable_bank(int bank); /* * Exception handler */ - -/* Call the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); void do_machine_check(struct pt_regs *, long); /* diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 6b79515abb82..edc2c581704a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -46,7 +46,9 @@ typedef int (*hyperv_fill_flush_list_func)( #define hv_set_reference_tsc(val) \ wrmsrl(HV_X64_MSR_REFERENCE_TSC, val) #define hv_set_clocksource_vdso(val) \ - ((val).archdata.vclock_mode = VCLOCK_HVCLOCK) + ((val).vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK) +#define hv_enable_vdso_clocksource() \ + vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); #define hv_get_raw_timer() rdtsc_ordered() void hyperv_callback_vector(void); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ebe1685e92dd..12c9684d59ba 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -41,6 +41,10 @@ /* Intel MSRs. Some also available on other CPUs */ +#define MSR_TEST_CTRL 0x00000033 +#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 +#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) + #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ #define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ @@ -70,6 +74,11 @@ */ #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) +/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ +#define MSR_IA32_CORE_CAPS 0x000000cf +#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5 +#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT) + #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) @@ -512,6 +521,8 @@ #define MSR_K7_HWCR 0xc0010015 #define MSR_K7_HWCR_SMMLOCK_BIT 0 #define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) +#define MSR_K7_HWCR_IRPERF_EN_BIT 30 +#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 9d5252c9685c..b809f117f3f4 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -23,6 +23,8 @@ #define MWAITX_MAX_LOOPS ((u32)-1) #define MWAITX_DISABLE_CSTATES 0xf0 +u32 get_umwait_control_msr(void); + static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 86e7317eb31f..694d8daf4983 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -295,6 +295,13 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } +#ifdef CONFIG_X86_IOPL_IOPERM +static inline void tss_update_io_bitmap(void) +{ + PVOP_VCALL0(cpu.update_io_bitmap); +} +#endif + static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 84812964d3dd..732f62e04ddb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -140,6 +140,10 @@ struct pv_cpu_ops { void (*load_sp0)(unsigned long sp0); +#ifdef CONFIG_X86_IOPL_IOPERM + void (*update_io_bitmap)(void); +#endif + void (*wbinvd)(void); /* cpuid emulation, mostly so that caps bits can be disabled */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 29964b0e1075..e855e9cf2c37 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -50,11 +50,22 @@ #define AMD64_L3_SLICE_SHIFT 48 #define AMD64_L3_SLICE_MASK \ - ((0xFULL) << AMD64_L3_SLICE_SHIFT) + (0xFULL << AMD64_L3_SLICE_SHIFT) +#define AMD64_L3_SLICEID_MASK \ + (0x7ULL << AMD64_L3_SLICE_SHIFT) #define AMD64_L3_THREAD_SHIFT 56 #define AMD64_L3_THREAD_MASK \ - ((0xFFULL) << AMD64_L3_THREAD_SHIFT) + (0xFFULL << AMD64_L3_THREAD_SHIFT) +#define AMD64_L3_F19H_THREAD_MASK \ + (0x3ULL << AMD64_L3_THREAD_SHIFT) + +#define AMD64_L3_EN_ALL_CORES BIT_ULL(47) +#define AMD64_L3_EN_ALL_SLICES BIT_ULL(46) + +#define AMD64_L3_COREID_SHIFT 42 +#define AMD64_L3_COREID_MASK \ + (0x7ULL << AMD64_L3_COREID_SHIFT) #define X86_RAW_EVENT_MASK \ (ARCH_PERFMON_EVENTSEL_EVENT | \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7e118660bbd9..afda66a6d325 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -595,12 +595,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } -static inline pud_t pud_mknotpresent(pud_t pud) -{ - return pfn_pud(pud_pfn(pud), - __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); -} - static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -627,12 +621,15 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return __pmd(val); } -/* mprotect needs to preserve PAT bits when updating vm_page_prot */ +/* + * mprotect needs to preserve PAT and encryption bits when updating + * vm_page_prot + */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; - pgprotval_t addbits = pgprot_val(newprot); + pgprotval_t addbits = pgprot_val(newprot) & ~_PAGE_CHG_MASK; return __pgprot(preservebits | addbits); } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 0239998d8cdc..65c2ecd730c5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -118,7 +118,7 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 19b137f1b3be..2ff9b98812b7 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -4,6 +4,11 @@ #define ARCH_DEFAULT_PKEY 0 +/* + * If more than 16 keys are ever supported, a thorough audit + * will be necessary to ensure that the types that store key + * numbers and masks have sufficient capacity. + */ #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 3d4cb83a8828..69485ca13665 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -103,14 +103,14 @@ static __always_inline bool should_resched(int preempt_offset) } #ifdef CONFIG_PREEMPTION - extern asmlinkage void ___preempt_schedule(void); + extern asmlinkage void preempt_schedule_thunk(void); # define __preempt_schedule() \ - asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) + asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule(void); - extern asmlinkage void ___preempt_schedule_notrace(void); + extern asmlinkage void preempt_schedule_notrace_thunk(void); # define __preempt_schedule_notrace() \ - asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT) + asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule_notrace(void); #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 09705ccc393c..3bcf27caf6c9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -26,6 +26,7 @@ struct vm86; #include <asm/fpu/types.h> #include <asm/unwind_hints.h> #include <asm/vmxfeatures.h> +#include <asm/vdso/processor.h> #include <linux/personality.h> #include <linux/cache.h> @@ -541,7 +542,6 @@ struct thread_struct { mm_segment_t addr_limit; unsigned int sig_on_uaccess_err:1; - unsigned int uaccess_err:1; /* uaccess failed */ /* Floating point and extended processor state */ struct fpu fpu; @@ -677,17 +677,6 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static __always_inline void rep_nop(void) -{ - asm volatile("rep; nop" ::: "memory"); -} - -static __always_inline void cpu_relax(void) -{ - rep_nop(); -} - /* * This function forces the icache and prefetched instruction stream to * catch up with reality in two very specific cases: diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 036c360910c5..a6e8373a5170 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_SECTIONS_H #define _ASM_X86_SECTIONS_H +#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed + #include <asm-generic/sections.h> #include <asm/extable.h> @@ -14,4 +16,22 @@ extern char __end_rodata_hpage_align[]; extern char __end_of_kernel_reserve[]; +extern unsigned long _brk_start, _brk_end; + +static inline bool arch_is_kernel_initmem_freed(unsigned long addr) +{ + /* + * If _brk_start has not been cleared, brk allocation is incomplete, + * and we can not make assumptions about its use. + */ + if (_brk_start) + return 0; + + /* + * After brk allocation is complete, space between _brk_end and _end + * is available for allocation. + */ + return addr >= _brk_end && addr < (unsigned long)&_end; +} + #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 64c3dce374e5..950532ccbc4a 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -46,6 +46,8 @@ int set_memory_4k(unsigned long addr, int numpages); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_np_noalias(unsigned long addr, int numpages); +int set_memory_nonglobal(unsigned long addr, int numpages); +int set_memory_global(unsigned long addr, int numpages); int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index f176114c04d4..84eab2724875 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -33,11 +33,7 @@ struct sigframe_ia32 { * legacy application accessing/modifying it. */ struct _fpstate_32 fpstate_unused; -#ifdef CONFIG_IA32_EMULATION - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; -#else /* !CONFIG_IA32_EMULATION */ - unsigned long extramask[_NSIG_WORDS-1]; -#endif /* CONFIG_IA32_EMULATION */ + unsigned int extramask[1]; char retcode[8]; /* fp state follows here */ }; diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 2fcbd6f33ef7..65e667279e0f 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -14,12 +14,5 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask); - - -#ifdef CONFIG_X86_X32_ABI -asmlinkage long sys32_x32_rt_sigreturn(void); -#endif #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 8db3fdb6102e..7cbf733d11af 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -13,23 +13,14 @@ #include <uapi/linux/audit.h> #include <linux/sched.h> #include <linux/err.h> -#include <asm/asm-offsets.h> /* For NR_syscalls */ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -#ifdef CONFIG_X86_64 -typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *); -#else -typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); -#endif /* CONFIG_X86_64 */ +typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) #define ia32_sys_call_table sys_call_table -#define __NR_syscall_compat_max __NR_syscall_max -#define IA32_NR_syscalls NR_syscalls #endif #if defined(CONFIG_IA32_EMULATION) @@ -168,6 +159,11 @@ static inline int syscall_get_arch(struct task_struct *task) task->thread_info.status & TS_COMPAT) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } + +void do_syscall_64(unsigned long nr, struct pt_regs *regs); +void do_int80_syscall_32(struct pt_regs *regs); +long do_fast_syscall_32(struct pt_regs *regs); + #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index e2389ce9bf58..a84333adeef2 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -8,6 +8,50 @@ struct pt_regs; +extern long __x64_sys_ni_syscall(const struct pt_regs *regs); +extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); + +/* + * Instead of the generic __SYSCALL_DEFINEx() definition, the x86 version takes + * struct pt_regs *regs as the only argument of the syscall stub(s) named as: + * __x64_sys_*() - 64-bit native syscall + * __ia32_sys_*() - 32-bit native syscall or common compat syscall + * __ia32_compat_sys_*() - 32-bit compat syscall + * __x32_compat_sys_*() - 64-bit X32 compat syscall + * + * The registers are decoded according to the ABI: + * 64-bit: RDI, RSI, RDX, R10, R8, R9 + * 32-bit: EBX, ECX, EDX, ESI, EDI, EBP + * + * The stub then passes the decoded arguments to the __se_sys_*() wrapper to + * perform sign-extension (omitted for zero-argument syscalls). Finally the + * arguments are passed to the __do_sys_*() function which is the actual + * syscall. These wrappers are marked as inline so the compiler can optimize + * the functions where appropriate. + * + * Example assembly (slightly re-ordered for better readability): + * + * <__x64_sys_recv>: <-- syscall with 4 parameters + * callq <__fentry__> + * + * mov 0x70(%rdi),%rdi <-- decode regs->di + * mov 0x68(%rdi),%rsi <-- decode regs->si + * mov 0x60(%rdi),%rdx <-- decode regs->dx + * mov 0x38(%rdi),%rcx <-- decode regs->r10 + * + * xor %r9d,%r9d <-- clear %r9 + * xor %r8d,%r8d <-- clear %r8 + * + * callq __sys_recvfrom <-- do the actual work in __sys_recvfrom() + * which takes 6 arguments + * + * cltq <-- extend return value to 64-bit + * retq <-- return + * + * This approach avoids leaking random user-provided register content down + * the call chain. + */ + /* Mapping of registers to parameters for syscalls on x86-64 and x32 */ #define SC_X86_64_REGS_TO_ARGS(x, ...) \ __MAP(x,__SC_ARGS \ @@ -21,68 +65,96 @@ struct pt_regs; ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ ,,(unsigned int)regs->di,,(unsigned int)regs->bp) -#ifdef CONFIG_IA32_EMULATION -/* - * For IA32 emulation, we need to handle "compat" syscalls *and* create - * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the - * ia32 regs in the proper order for shared or "common" syscalls. As some - * syscalls may not be implemented, we need to expand COND_SYSCALL in - * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this - * case as well. - */ -#define __IA32_COMPAT_SYS_STUB0(x, name) \ - asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs);\ - ALLOW_ERROR_INJECTION(__ia32_compat_sys_##name, ERRNO); \ - asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs)\ +#define __SYS_STUB0(abi, name) \ + long __##abi##_##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \ + long __##abi##_##name(const struct pt_regs *regs) \ + __alias(__do_##name); + +#define __SYS_STUBx(abi, name, ...) \ + long __##abi##_##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \ + long __##abi##_##name(const struct pt_regs *regs) \ { \ - return __se_compat_sys_##name(); \ + return __se_##name(__VA_ARGS__); \ } -#define __IA32_COMPAT_SYS_STUBx(x, name, ...) \ - asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs);\ - ALLOW_ERROR_INJECTION(__ia32_compat_sys##name, ERRNO); \ - asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs)\ +#define __COND_SYSCALL(abi, name) \ + __weak long __##abi##_##name(const struct pt_regs *__unused) \ { \ - return __se_compat_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ + return sys_ni_syscall(); \ } +#define __SYS_NI(abi, name) \ + SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers); + +#ifdef CONFIG_X86_64 +#define __X64_SYS_STUB0(name) \ + __SYS_STUB0(x64, sys_##name) + +#define __X64_SYS_STUBx(x, name, ...) \ + __SYS_STUBx(x64, sys##name, \ + SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__)) + +#define __X64_COND_SYSCALL(name) \ + __COND_SYSCALL(x64, sys_##name) + +#define __X64_SYS_NI(name) \ + __SYS_NI(x64, sys_##name) +#else /* CONFIG_X86_64 */ +#define __X64_SYS_STUB0(name) +#define __X64_SYS_STUBx(x, name, ...) +#define __X64_COND_SYSCALL(name) +#define __X64_SYS_NI(name) +#endif /* CONFIG_X86_64 */ + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +#define __IA32_SYS_STUB0(name) \ + __SYS_STUB0(ia32, sys_##name) + #define __IA32_SYS_STUBx(x, name, ...) \ - asmlinkage long __ia32_sys##name(const struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__ia32_sys##name, ERRNO); \ - asmlinkage long __ia32_sys##name(const struct pt_regs *regs) \ - { \ - return __se_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ - } + __SYS_STUBx(ia32, sys##name, \ + SC_IA32_REGS_TO_ARGS(x, __VA_ARGS__)) + +#define __IA32_COND_SYSCALL(name) \ + __COND_SYSCALL(ia32, sys_##name) +#define __IA32_SYS_NI(name) \ + __SYS_NI(ia32, sys_##name) +#else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ +#define __IA32_SYS_STUB0(name) +#define __IA32_SYS_STUBx(x, name, ...) +#define __IA32_COND_SYSCALL(name) +#define __IA32_SYS_NI(name) +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ + +#ifdef CONFIG_IA32_EMULATION /* - * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias - * named __ia32_sys_*() + * For IA32 emulation, we need to handle "compat" syscalls *and* create + * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the + * ia32 regs in the proper order for shared or "common" syscalls. As some + * syscalls may not be implemented, we need to expand COND_SYSCALL in + * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this + * case as well. */ +#define __IA32_COMPAT_SYS_STUB0(name) \ + __SYS_STUB0(ia32, compat_sys_##name) -#define SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\ - ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ - SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \ - asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused) +#define __IA32_COMPAT_SYS_STUBx(x, name, ...) \ + __SYS_STUBx(ia32, compat_sys##name, \ + SC_IA32_REGS_TO_ARGS(x, __VA_ARGS__)) -#define COND_SYSCALL(name) \ - asmlinkage __weak long __x64_sys_##name(const struct pt_regs *__unused) \ - { \ - return sys_ni_syscall(); \ - } \ - asmlinkage __weak long __ia32_sys_##name(const struct pt_regs *__unused)\ - { \ - return sys_ni_syscall(); \ - } +#define __IA32_COMPAT_COND_SYSCALL(name) \ + __COND_SYSCALL(ia32, compat_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); \ - SYSCALL_ALIAS(__ia32_sys_##name, sys_ni_posix_timers) +#define __IA32_COMPAT_SYS_NI(name) \ + __SYS_NI(ia32, compat_sys_##name) #else /* CONFIG_IA32_EMULATION */ +#define __IA32_COMPAT_SYS_STUB0(name) #define __IA32_COMPAT_SYS_STUBx(x, name, ...) -#define __IA32_SYS_STUBx(x, fullname, name, ...) +#define __IA32_COMPAT_COND_SYSCALL(name) +#define __IA32_COMPAT_SYS_NI(name) #endif /* CONFIG_IA32_EMULATION */ @@ -92,25 +164,23 @@ struct pt_regs; * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common * with x86_64 obviously do not need such care. */ -#define __X32_COMPAT_SYS_STUB0(x, name, ...) \ - asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs);\ - ALLOW_ERROR_INJECTION(__x32_compat_sys_##name, ERRNO); \ - asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs)\ - { \ - return __se_compat_sys_##name();\ - } +#define __X32_COMPAT_SYS_STUB0(name) \ + __SYS_STUB0(x32, compat_sys_##name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) \ - asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs);\ - ALLOW_ERROR_INJECTION(__x32_compat_sys##name, ERRNO); \ - asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs)\ - { \ - return __se_compat_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ - } + __SYS_STUBx(x32, compat_sys##name, \ + SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__)) + +#define __X32_COMPAT_COND_SYSCALL(name) \ + __COND_SYSCALL(x32, compat_sys_##name) +#define __X32_COMPAT_SYS_NI(name) \ + __SYS_NI(x32, compat_sys_##name) #else /* CONFIG_X86_X32 */ -#define __X32_COMPAT_SYS_STUB0(x, name) +#define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) +#define __X32_COMPAT_COND_SYSCALL(name) +#define __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_X86_X32 */ @@ -121,15 +191,12 @@ struct pt_regs; * of them. */ #define COMPAT_SYSCALL_DEFINE0(name) \ - static long __se_compat_sys_##name(void); \ - static inline long __do_compat_sys_##name(void); \ - __IA32_COMPAT_SYS_STUB0(x, name) \ - __X32_COMPAT_SYS_STUB0(x, name) \ - static long __se_compat_sys_##name(void) \ - { \ - return __do_compat_sys_##name(); \ - } \ - static inline long __do_compat_sys_##name(void) + static long \ + __do_compat_sys_##name(const struct pt_regs *__unused); \ + __IA32_COMPAT_SYS_STUB0(name) \ + __X32_COMPAT_SYS_STUB0(name) \ + static long \ + __do_compat_sys_##name(const struct pt_regs *__unused) #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ @@ -148,58 +215,19 @@ struct pt_regs; * kernel/time/posix-stubs.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ - cond_syscall(__ia32_compat_sys_##name); \ - cond_syscall(__x32_compat_sys_##name) + __IA32_COMPAT_COND_SYSCALL(name) \ + __X32_COMPAT_COND_SYSCALL(name) #define COMPAT_SYS_NI(name) \ - SYSCALL_ALIAS(__ia32_compat_sys_##name, sys_ni_posix_timers); \ - SYSCALL_ALIAS(__x32_compat_sys_##name, sys_ni_posix_timers) + __IA32_COMPAT_SYS_NI(name) \ + __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_COMPAT */ - -/* - * Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes - * struct pt_regs *regs as the only argument of the syscall stub named - * __x64_sys_*(). It decodes just the registers it needs and passes them on to - * the __se_sys_*() wrapper performing sign extension and then to the - * __do_sys_*() function doing the actual job. These wrappers and functions - * are inlined (at least in very most cases), meaning that the assembly looks - * as follows (slightly re-ordered for better readability): - * - * <__x64_sys_recv>: <-- syscall with 4 parameters - * callq <__fentry__> - * - * mov 0x70(%rdi),%rdi <-- decode regs->di - * mov 0x68(%rdi),%rsi <-- decode regs->si - * mov 0x60(%rdi),%rdx <-- decode regs->dx - * mov 0x38(%rdi),%rcx <-- decode regs->r10 - * - * xor %r9d,%r9d <-- clear %r9 - * xor %r8d,%r8d <-- clear %r8 - * - * callq __sys_recvfrom <-- do the actual work in __sys_recvfrom() - * which takes 6 arguments - * - * cltq <-- extend return value to 64-bit - * retq <-- return - * - * This approach avoids leaking random user-provided register content down - * the call chain. - * - * If IA32_EMULATION is enabled, this macro generates an additional wrapper - * named __ia32_sys_*() which decodes the struct pt_regs *regs according - * to the i386 calling convention (bx, cx, dx, si, di, bp). - */ #define __SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long __x64_sys##name(const struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__x64_sys##name, ERRNO); \ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ - asmlinkage long __x64_sys##name(const struct pt_regs *regs) \ - { \ - return __se_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ - } \ + __X64_SYS_STUBx(x, name, __VA_ARGS__) \ __IA32_SYS_STUBx(x, name, __VA_ARGS__) \ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ @@ -217,33 +245,28 @@ struct pt_regs; * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() * macros to work correctly. */ -#ifndef SYSCALL_DEFINE0 #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\ - ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ - asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused) -#endif - -#ifndef COND_SYSCALL -#define COND_SYSCALL(name) \ - asmlinkage __weak long __x64_sys_##name(const struct pt_regs *__unused) \ - { \ - return sys_ni_syscall(); \ - } -#endif + static long __do_sys_##sname(const struct pt_regs *__unused); \ + __X64_SYS_STUB0(sname) \ + __IA32_SYS_STUB0(sname) \ + static long __do_sys_##sname(const struct pt_regs *__unused) + +#define COND_SYSCALL(name) \ + __X64_COND_SYSCALL(name) \ + __IA32_COND_SYSCALL(name) -#ifndef SYS_NI -#define SYS_NI(name) SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); -#endif +#define SYS_NI(name) \ + __X64_SYS_NI(name) \ + __IA32_SYS_NI(name) /* * For VSYSCALLS, we need to declare these three syscalls with the new * pt_regs-based calling convention for in-kernel use. */ -asmlinkage long __x64_sys_getcpu(const struct pt_regs *regs); -asmlinkage long __x64_sys_gettimeofday(const struct pt_regs *regs); -asmlinkage long __x64_sys_time(const struct pt_regs *regs); +long __x64_sys_getcpu(const struct pt_regs *regs); +long __x64_sys_gettimeofday(const struct pt_regs *regs); +long __x64_sys_time(const struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_WRAPPER_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 91b7b6e1a115..6714a358235d 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -8,42 +8,8 @@ #ifndef _ASM_X86_SYSCALLS_H #define _ASM_X86_SYSCALLS_H -#include <linux/compiler.h> -#include <linux/linkage.h> -#include <linux/signal.h> -#include <linux/types.h> - /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on); -#ifdef CONFIG_X86_32 -/* - * These definitions are only valid on pure 32-bit systems; x86-64 uses a - * different syscall calling convention - */ -asmlinkage long sys_ioperm(unsigned long, unsigned long, int); -asmlinkage long sys_iopl(unsigned int); - -/* kernel/ldt.c */ -asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); - -/* kernel/signal.c */ -asmlinkage long sys_rt_sigreturn(void); - -/* kernel/tls.c */ -asmlinkage long sys_set_thread_area(struct user_desc __user *); -asmlinkage long sys_get_thread_area(struct user_desc __user *); - -/* X86_32 only */ - -/* kernel/signal.c */ -asmlinkage long sys_sigreturn(void); - -/* kernel/vm86_32.c */ -struct vm86_struct; -asmlinkage long sys_vm86old(struct vm86_struct __user *); -asmlinkage long sys_vm86(unsigned long, unsigned long); - -#endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index cf4327986e98..8de8ceccb8bc 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,7 +92,7 @@ struct thread_info { #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ -#define TIF_NOHZ 19 /* in adaptive nohz mode */ +#define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -122,7 +122,7 @@ struct thread_info { #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) -#define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) @@ -133,19 +133,15 @@ struct thread_info { #define _TIF_X32 (1 << TIF_X32) #define _TIF_FSCHECK (1 << TIF_FSCHECK) -/* - * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for - * enter_from_user_mode() - */ +/* Work to do before invoking the actual syscall. */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ - _TIF_NOHZ) + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \ - _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD) /* * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 4b14d2318251..79d8d5496330 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -193,4 +193,29 @@ static inline void sched_clear_itmt_support(void) } #endif /* CONFIG_SCHED_MC_PRIO */ +#ifdef CONFIG_SMP +#include <asm/cpufeature.h> + +DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key); + +#define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key) + +DECLARE_PER_CPU(unsigned long, arch_freq_scale); + +static inline long arch_scale_freq_capacity(int cpu) +{ + return per_cpu(arch_freq_scale, cpu); +} +#define arch_scale_freq_capacity arch_scale_freq_capacity + +extern void arch_scale_freq_tick(void); +#define arch_scale_freq_tick arch_scale_freq_tick + +extern void arch_set_max_freq_ratio(bool turbo_disabled); +#else +static inline void arch_set_max_freq_ratio(bool turbo_disabled) +{ +} +#endif + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index ffa0dc8a535e..c26a7e1d8a2c 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -76,27 +76,24 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long err dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code); -#ifdef CONFIG_X86_64 -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long address); -asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); -asmlinkage __visible notrace -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); -void __init trap_init(void); -#endif dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code); dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code); dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code); -#ifdef CONFIG_X86_MCE -dotraplinkage void do_machine_check(struct pt_regs *regs, long error_code); -#endif dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code); #endif dotraplinkage void do_mce(struct pt_regs *regs, long error_code); +#ifdef CONFIG_X86_64 +asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); +asmlinkage __visible notrace +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); +void __init trap_init(void); +#endif + static inline int get_si_code(unsigned long condition) { if (condition & DR_STEP) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 61d93f062a36..c8247a84244b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -193,23 +193,12 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : : "A" (x), "r" (addr) \ : : label) -#define __put_user_asm_ex_u64(x, addr) \ - asm volatile("\n" \ - "1: movl %%eax,0(%1)\n" \ - "2: movl %%edx,4(%1)\n" \ - "3:" \ - _ASM_EXTABLE_EX(1b, 2b) \ - _ASM_EXTABLE_EX(2b, 3b) \ - : : "A" (x), "r" (addr)) - #define __put_user_x8(x, ptr, __ret_pu) \ asm volatile("call __put_user_8" : "=a" (__ret_pu) \ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "", "er", label) -#define __put_user_asm_ex_u64(x, addr) \ - __put_user_asm_ex(x, addr, "q", "", "er") #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif @@ -289,31 +278,6 @@ do { \ } \ } while (0) -/* - * This doesn't do __uaccess_begin/end - the exception handling - * around it must do that. - */ -#define __put_user_size_ex(x, ptr, size) \ -do { \ - __chk_user_ptr(ptr); \ - switch (size) { \ - case 1: \ - __put_user_asm_ex(x, ptr, "b", "b", "iq"); \ - break; \ - case 2: \ - __put_user_asm_ex(x, ptr, "w", "w", "ir"); \ - break; \ - case 4: \ - __put_user_asm_ex(x, ptr, "l", "k", "ir"); \ - break; \ - case 8: \ - __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \ - break; \ - default: \ - __put_user_bad(); \ - } \ -} while (0) - #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval, errret) \ ({ \ @@ -335,12 +299,9 @@ do { \ "i" (errret), "0" (retval)); \ }) -#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() #else #define __get_user_asm_u64(x, ptr, retval, errret) \ __get_user_asm(x, ptr, retval, "q", "", "=r", errret) -#define __get_user_asm_ex_u64(x, ptr) \ - __get_user_asm_ex(x, ptr, "q", "", "=r") #endif #define __get_user_size(x, ptr, size, retval, errret) \ @@ -378,53 +339,6 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) -#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile("\n" \ - "1: mov"itype" %2,%"rtype"1\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: mov %3,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ - : "=r" (err), ltype(x) \ - : "m" (__m(addr)), "i" (errret), "0" (err)) - -/* - * This doesn't do __uaccess_begin/end - the exception handling - * around it must do that. - */ -#define __get_user_size_ex(x, ptr, size) \ -do { \ - __chk_user_ptr(ptr); \ - switch (size) { \ - case 1: \ - __get_user_asm_ex(x, ptr, "b", "b", "=q"); \ - break; \ - case 2: \ - __get_user_asm_ex(x, ptr, "w", "w", "=r"); \ - break; \ - case 4: \ - __get_user_asm_ex(x, ptr, "l", "k", "=r"); \ - break; \ - case 8: \ - __get_user_asm_ex_u64(x, ptr); \ - break; \ - default: \ - (x) = __get_user_bad(); \ - } \ -} while (0) - -#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %1,%"rtype"0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:xor"itype" %"rtype"0,%"rtype"0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_EX(1b, 3b) \ - : ltype(x) : "m" (__m(addr))) - #define __put_user_nocheck(x, ptr, size) \ ({ \ __label__ __pu_label; \ @@ -480,29 +394,6 @@ struct __large_struct { unsigned long buf[100]; }; retval = __put_user_failed(x, addr, itype, rtype, ltype, errret); \ } while (0) -#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %"rtype"0,%1\n" \ - "2:\n" \ - _ASM_EXTABLE_EX(1b, 2b) \ - : : ltype(x), "m" (__m(addr))) - -/* - * uaccess_try and catch - */ -#define uaccess_try do { \ - current->thread.uaccess_err = 0; \ - __uaccess_begin(); \ - barrier(); - -#define uaccess_try_nospec do { \ - current->thread.uaccess_err = 0; \ - __uaccess_begin_nospec(); \ - -#define uaccess_catch(err) \ - __uaccess_end(); \ - (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \ -} while (0) - /** * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. @@ -552,28 +443,6 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -/* - * {get|put}_user_try and catch - * - * get_user_try { - * get_user_ex(...); - * } get_user_catch(err) - */ -#define get_user_try uaccess_try_nospec -#define get_user_catch(err) uaccess_catch(err) - -#define get_user_ex(x, ptr) do { \ - unsigned long __gue_val; \ - __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \ - (x) = (__force __typeof__(*(ptr)))__gue_val; \ -} while (0) - -#define put_user_try uaccess_try -#define put_user_catch(err) uaccess_catch(err) - -#define put_user_ex(x, ptr) \ - __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long @@ -584,99 +453,6 @@ extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); -extern void __cmpxchg_wrong_size(void) - __compiletime_error("Bad argument size for cmpxchg"); - -#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \ -({ \ - int __ret = 0; \ - __typeof__(*(ptr)) __old = (old); \ - __typeof__(*(ptr)) __new = (new); \ - __uaccess_begin_nospec(); \ - switch (size) { \ - case 1: \ - { \ - asm volatile("\n" \ - "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\n" \ - "\t.section .fixup, \"ax\"\n" \ - "3:\tmov %3, %0\n" \ - "\tjmp 2b\n" \ - "\t.previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ - : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ - : "i" (-EFAULT), "q" (__new), "1" (__old) \ - : "memory" \ - ); \ - break; \ - } \ - case 2: \ - { \ - asm volatile("\n" \ - "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\n" \ - "\t.section .fixup, \"ax\"\n" \ - "3:\tmov %3, %0\n" \ - "\tjmp 2b\n" \ - "\t.previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ - : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ - : "i" (-EFAULT), "r" (__new), "1" (__old) \ - : "memory" \ - ); \ - break; \ - } \ - case 4: \ - { \ - asm volatile("\n" \ - "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\n" \ - "\t.section .fixup, \"ax\"\n" \ - "3:\tmov %3, %0\n" \ - "\tjmp 2b\n" \ - "\t.previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ - : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ - : "i" (-EFAULT), "r" (__new), "1" (__old) \ - : "memory" \ - ); \ - break; \ - } \ - case 8: \ - { \ - if (!IS_ENABLED(CONFIG_X86_64)) \ - __cmpxchg_wrong_size(); \ - \ - asm volatile("\n" \ - "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\n" \ - "\t.section .fixup, \"ax\"\n" \ - "3:\tmov %3, %0\n" \ - "\tjmp 2b\n" \ - "\t.previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ - : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ - : "i" (-EFAULT), "r" (__new), "1" (__old) \ - : "memory" \ - ); \ - break; \ - } \ - default: \ - __cmpxchg_wrong_size(); \ - } \ - __uaccess_end(); \ - *(uval) = __old; \ - __ret; \ -}) - -#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new) \ -({ \ - access_ok((ptr), sizeof(*(ptr))) ? \ - __user_atomic_cmpxchg_inatomic((uval), (ptr), \ - (old), (new), sizeof(*(ptr))) : \ - -EFAULT; \ -}) - /* * movsl can be slow when source and dest are not both 8-byte aligned */ @@ -695,15 +471,6 @@ extern struct movsl_mask { #endif /* - * We rely on the nested NMI work to allow atomic faults from the NMI path; the - * nested NMI paths are careful to preserve CR2. - * - * Caller must use pagefault_enable/disable, or run in interrupt context, - * and also do a uaccess_ok() check - */ -#define __copy_from_user_nmi __copy_from_user_inatomic - -/* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() * checking before using them, but you have to surround them with the diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index ba2dc1930630..388a40660c7b 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -23,33 +23,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u8 *)to, from, ret, - "b", "b", "=q", 1); - __uaccess_end(); - return ret; - case 2: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u16 *)to, from, ret, - "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 4: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u32 *)to, from, ret, - "l", "k", "=r", 4); - __uaccess_end(); - return ret; - } - } return __copy_user_ll(to, (__force const void *)from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 5cd1caa8bc65..bc10e3dc64fe 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -65,117 +65,13 @@ copy_to_user_mcsafe(void *to, const void *from, unsigned len) static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { - int ret = 0; - - if (!__builtin_constant_p(size)) - return copy_user_generic(dst, (__force void *)src, size); - switch (size) { - case 1: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src, - ret, "b", "b", "=q", 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src, - ret, "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src, - ret, "l", "k", "=r", 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 8); - __uaccess_end(); - return ret; - case 10: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 10); - if (likely(!ret)) - __get_user_asm_nozero(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 16: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 16); - if (likely(!ret)) - __get_user_asm_nozero(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); - __uaccess_end(); - return ret; - default: - return copy_user_generic(dst, (__force void *)src, size); - } + return copy_user_generic(dst, (__force void *)src, size); } static __always_inline __must_check unsigned long raw_copy_to_user(void __user *dst, const void *src, unsigned long size) { - int ret = 0; - - if (!__builtin_constant_p(size)) - return copy_user_generic((__force void *)dst, src, size); - switch (size) { - case 1: - __uaccess_begin(); - __put_user_asm(*(u8 *)src, (u8 __user *)dst, - ret, "b", "b", "iq", 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __put_user_asm(*(u16 *)src, (u16 __user *)dst, - ret, "w", "w", "ir", 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __put_user_asm(*(u32 *)src, (u32 __user *)dst, - ret, "l", "k", "ir", 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 8); - __uaccess_end(); - return ret; - case 10: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 10); - if (likely(!ret)) { - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); - } - __uaccess_end(); - return ret; - case 16: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 16); - if (likely(!ret)) { - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); - } - __uaccess_end(); - return ret; - default: - return copy_user_generic((__force void *)dst, src, size); - } + return copy_user_generic((__force void *)dst, src, size); } static __always_inline __must_check diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index a7dd080749ce..c1c3d31b15c0 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -13,10 +13,13 @@ # define __ARCH_WANT_SYS_OLD_MMAP # define __ARCH_WANT_SYS_OLD_SELECT +# define __NR_ia32_syscall_max __NR_syscall_max + # else # include <asm/unistd_64.h> # include <asm/unistd_64_x32.h> +# include <asm/unistd_32_ia32.h> # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME # define __ARCH_WANT_COMPAT_SYS_PREADV64 @@ -26,6 +29,10 @@ # endif +# define NR_syscalls (__NR_syscall_max + 1) +# define X32_NR_syscalls (__NR_x32_syscall_max + 1) +# define IA32_NR_syscalls (__NR_ia32_syscall_max + 1) + # define __ARCH_WANT_NEW_STAT # define __ARCH_WANT_OLD_READDIR # define __ARCH_WANT_OLD_STAT diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 7803114aa140..13687bf0e0a9 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -858,4 +858,6 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +void uv_bau_message_interrupt(struct pt_regs *regs); + #endif /* _ASM_X86_UV_UV_BAU_H */ diff --git a/arch/x86/include/asm/vdso/clocksource.h b/arch/x86/include/asm/vdso/clocksource.h new file mode 100644 index 000000000000..119ac8612d89 --- /dev/null +++ b/arch/x86/include/asm/vdso/clocksource.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_CLOCKSOURCE_H +#define __ASM_VDSO_CLOCKSOURCE_H + +#define VDSO_ARCH_CLOCKMODES \ + VDSO_CLOCKMODE_TSC, \ + VDSO_CLOCKMODE_PVCLOCK, \ + VDSO_CLOCKMODE_HVCLOCK + +#endif /* __ASM_VDSO_CLOCKSOURCE_H */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 6ee1f7dba34b..9a6dc9b4ec99 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -243,7 +243,7 @@ static u64 vread_hvclock(void) static inline u64 __arch_get_hw_counter(s32 clock_mode) { - if (clock_mode == VCLOCK_TSC) + if (likely(clock_mode == VDSO_CLOCKMODE_TSC)) return (u64)rdtsc_ordered(); /* * For any memory-mapped vclock type, we need to make sure that gcc @@ -252,13 +252,13 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode) * question isn't enabled, which will segfault. Hence the barriers. */ #ifdef CONFIG_PARAVIRT_CLOCK - if (clock_mode == VCLOCK_PVCLOCK) { + if (clock_mode == VDSO_CLOCKMODE_PVCLOCK) { barrier(); return vread_pvclock(); } #endif #ifdef CONFIG_HYPERV_TIMER - if (clock_mode == VCLOCK_HVCLOCK) { + if (clock_mode == VDSO_CLOCKMODE_HVCLOCK) { barrier(); return vread_hvclock(); } diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h new file mode 100644 index 000000000000..57b1a7034c64 --- /dev/null +++ b/arch/x86/include/asm/vdso/processor.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 ARM Ltd. + */ +#ifndef __ASM_VDSO_PROCESSOR_H +#define __ASM_VDSO_PROCESSOR_H + +#ifndef __ASSEMBLY__ + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static __always_inline void rep_nop(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +static __always_inline void cpu_relax(void) +{ + rep_nop(); +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index 0026ab2123ce..be199a9b2676 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -10,8 +10,6 @@ #include <asm/vgtod.h> #include <asm/vvar.h> -int vclocks_used __read_mostly; - DEFINE_VVAR(struct vdso_data, _vdso_data); /* * Update the vDSO data page to keep in sync with kernel timekeeping. @@ -23,19 +21,6 @@ struct vdso_data *__x86_get_k_vdso_data(void) } #define __arch_get_k_vdso_data __x86_get_k_vdso_data -static __always_inline -int __x86_get_clock_mode(struct timekeeper *tk) -{ - int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - - /* Mark the new vclock used. */ - BUILD_BUG_ON(VCLOCK_MAX >= 32); - WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); - - return vclock_mode; -} -#define __arch_get_clock_mode __x86_get_clock_mode - /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index a2638c6124ed..7aa38b2ad8a9 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -2,6 +2,11 @@ #ifndef _ASM_X86_VGTOD_H #define _ASM_X86_VGTOD_H +/* + * This check is required to prevent ARCH=um to include + * unwanted headers. + */ +#ifdef CONFIG_GENERIC_GETTIMEOFDAY #include <linux/compiler.h> #include <asm/clocksource.h> #include <vdso/datapage.h> @@ -14,11 +19,6 @@ typedef u64 gtod_long_t; #else typedef unsigned long gtod_long_t; #endif - -extern int vclocks_used; -static inline bool vclock_was_used(int vclock) -{ - return READ_ONCE(vclocks_used) & (1 << vclock); -} +#endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2a85287b3685..8521af3fef27 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,7 @@ #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) -#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000 +#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index a50e4a0de315..9915990fd8cf 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -81,6 +81,7 @@ #define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */ #define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* "" Processor Trace logs GPAs */ #define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */ +#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 503d3f42da16..3f3f780c8c65 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -390,6 +390,7 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 #define KVM_STATE_NESTED_EVMCS 0x00000004 +#define KVM_STATE_NESTED_MTF_PENDING 0x00000008 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b294c13809a..bb5abfef0256 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,6 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y @@ -53,6 +52,8 @@ obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o +obj-$(CONFIG_X86_32) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 04205ce127a1..1ae5439a9a85 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -45,6 +45,7 @@ EXPORT_SYMBOL(acpi_disabled); #define PREFIX "ACPI: " int acpi_noirq; /* skip ACPI IRQ initialization */ +int acpi_nobgrt; /* skip ACPI BGRT */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); @@ -1619,7 +1620,7 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); - if (IS_ENABLED(CONFIG_ACPI_BGRT)) + if (IS_ENABLED(CONFIG_ACPI_BGRT) && !acpi_nobgrt) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); if (!acpi_noirq) @@ -1671,6 +1672,13 @@ static int __init parse_acpi(char *arg) } early_param("acpi", parse_acpi); +static int __init parse_acpi_bgrt(char *arg) +{ + acpi_nobgrt = true; + return 0; +} +early_param("bgrt_disable", parse_acpi_bgrt); + /* FIXME: Using pci= for an ACPI parameter is a travesty. */ static int __init parse_pci(char *arg) { @@ -1740,7 +1748,7 @@ int __acpi_acquire_global_lock(unsigned int *lock) new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); val = cmpxchg(lock, old, new); } while (unlikely (val != old)); - return (new < 3) ? -1 : 0; + return ((new & 0x3) < 3) ? -1 : 0; } int __acpi_release_global_lock(unsigned int *lock) diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 26b7256f590f..ed3b04483972 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -43,7 +43,7 @@ unsigned long acpi_get_wakeup_address(void) * * Wrapper around acpi_enter_sleep_state() to be called by assmebly. */ -acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) +asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) { return acpi_enter_sleep_state(state); } diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index d06c2079b6c1..171a40c74db6 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -19,4 +19,4 @@ extern void do_suspend_lowlevel(void); extern int x86_acpi_suspend_lowlevel(void); -acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state); +asmlinkage acpi_status x86_acpi_enter_sleep_state(u8 state); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 15ac0d5f4b40..7867dfb3963e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1167,8 +1167,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries atomic_cond_read_acquire(&desc.refs, !VAL); } -void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) { struct insn insn; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 69aed0ebbdfc..b6b3297851f3 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -36,10 +36,9 @@ static const struct pci_device_id amd_root_ids[] = { {} }; - #define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 -const struct pci_device_id amd_nb_misc_ids[] = { +static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, @@ -56,7 +55,6 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, {} }; -EXPORT_SYMBOL_GPL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5f973fed3c9f..81b9c63dae1b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -546,12 +546,6 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -#define DEADLINE_MODEL_MATCH_FUNC(model, func) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func } - -#define DEADLINE_MODEL_MATCH_REV(model, rev) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev } - static u32 hsx_deadline_rev(void) { switch (boot_cpu_data.x86_stepping) { @@ -588,23 +582,23 @@ static u32 skx_deadline_rev(void) } static const struct x86_cpu_id deadline_match[] = { - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_X, &skx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL, 0x22), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_L, 0x20), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_G, 0x17), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL, 0x25), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_G, 0x17), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_L, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_L, 0x52), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), {}, }; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 2c5676b0a6e7..67768e54438b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -557,6 +557,12 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irqd->hwirq = virq + i; irqd_set_single_target(irqd); /* + * Prevent that any of these interrupts is invoked in + * non interrupt context via e.g. generic_handle_irq() + * as that can corrupt the affinity move state. + */ + irqd_set_handle_enforce_irqctx(irqd); + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. This is * required for check_timer() to work correctly as it might @@ -838,13 +844,15 @@ static void free_moved_vector(struct apic_chip_data *apicd) bool managed = apicd->is_managed; /* - * This should never happen. Managed interrupts are not - * migrated except on CPU down, which does not involve the - * cleanup vector. But try to keep the accounting correct - * nevertheless. + * Managed interrupts are usually not migrated away + * from an online CPU, but CPU isolation 'managed_irq' + * can make that happen. + * 1) Activation does not take the isolation into account + * to keep the code simple + * 2) Migration away from an isolated CPU can happen when + * a non-isolated CPU which is in the calculated + * affinity mask comes online. */ - WARN_ON_ONCE(managed); - trace_vector_free_moved(apicd->irq, cpu, vector, managed); irq_matrix_free(vector_matrix, cpu, vector, managed); per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c7ee3df4d0b..3ca07ad552ae 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -88,7 +88,6 @@ static void __used common(void) OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); - OFFSET(BP_code32_start, boot_params, hdr.code32_start); BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 82826f2275cc..6e043f295a60 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,12 +3,9 @@ # error "Please do not build this file directly, build asm-offsets.c instead" #endif -#include <asm/ucontext.h> +#include <linux/efi.h> -#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, -static char syscalls[] = { -#include <asm/syscalls_32.h> -}; +#include <asm/ucontext.h> /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -62,6 +59,5 @@ void foo(void) #endif BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); - DEFINE(NR_syscalls, sizeof(syscalls)); + DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 24d2fde30d00..c2a47016f243 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,30 +5,6 @@ #include <asm/ia32.h> -#define __SYSCALL_64(nr, sym, qual) [nr] = 1, -#define __SYSCALL_X32(nr, sym, qual) -static char syscalls_64[] = { -#include <asm/syscalls_64.h> -}; -#undef __SYSCALL_64 -#undef __SYSCALL_X32 - -#ifdef CONFIG_X86_X32_ABI -#define __SYSCALL_64(nr, sym, qual) -#define __SYSCALL_X32(nr, sym, qual) [nr] = 1, -static char syscalls_x32[] = { -#include <asm/syscalls_64.h> -}; -#undef __SYSCALL_64 -#undef __SYSCALL_X32 -#endif - -#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, -static char syscalls_ia32[] = { -#include <asm/syscalls_32.h> -}; -#undef __SYSCALL_I386 - #if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) #include <asm/kvm_para.h> #endif @@ -90,17 +66,5 @@ int main(void) DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif - - DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); - DEFINE(NR_syscalls, sizeof(syscalls_64)); - -#ifdef CONFIG_X86_X32_ABI - DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1); - DEFINE(X32_NR_syscalls, sizeof(syscalls_x32)); -#endif - - DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); - DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); - return 0; } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ac83a0fef628..547ad7bbf0e0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -28,6 +28,7 @@ static const int amd_erratum_383[]; static const int amd_erratum_400[]; +static const int amd_erratum_1054[]; static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); /* @@ -393,6 +394,35 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; } +static void amd_detect_ppin(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + if (!cpu_has(c, X86_FEATURE_AMD_PPIN)) + return; + + /* When PPIN is defined in CPUID, still need to check PPIN_CTL MSR */ + if (rdmsrl_safe(MSR_AMD_PPIN_CTL, &val)) + goto clear_ppin; + + /* PPIN is locked in disabled mode, clear feature bit */ + if ((val & 3UL) == 1UL) + goto clear_ppin; + + /* If PPIN is disabled, try to enable it */ + if (!(val & 2UL)) { + wrmsrl_safe(MSR_AMD_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_AMD_PPIN_CTL, &val); + } + + /* If PPIN_EN bit is 1, return from here; otherwise fall through */ + if (val & 2UL) + return; + +clear_ppin: + clear_cpu_cap(c, X86_FEATURE_AMD_PPIN); +} + u16 amd_get_nb_id(int cpu) { return per_cpu(cpu_llc_id, cpu); @@ -925,7 +955,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: init_amd_zn(c); break; + case 0x17: fallthrough; + case 0x19: init_amd_zn(c); break; } /* @@ -940,6 +971,7 @@ static void init_amd(struct cpuinfo_x86 *c) amd_detect_cmp(c); amd_get_topology(c); srat_detect_node(c); + amd_detect_ppin(c); init_amd_cacheinfo(c); @@ -972,6 +1004,15 @@ static void init_amd(struct cpuinfo_x86 *c) /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + /* + * Turn on the Instructions Retired free counter on machines not + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (cpu_has(c, X86_FEATURE_IRPERF) && + !cpu_has_amd_erratum(c, amd_erratum_1054)) + msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); } #ifdef CONFIG_X86_32 @@ -1099,6 +1140,10 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ +static const int amd_erratum_1054[] = + AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); + static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52c9bfbbdb2a..bed0cb83fe24 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -445,7 +445,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) * cpuid bit to be set. We need to ensure that we * update that bit in this CPU's "cpu_info". */ - get_cpu_cap(c); + set_cpu_cap(c, X86_FEATURE_OSPKE); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -1008,8 +1008,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) -#define VULNWL(_vendor, _family, _model, _whitelist) \ - { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } +#define VULNWL(vendor, family, model, whitelist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) #define VULNWL_INTEL(model, whitelist) \ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) @@ -1224,6 +1224,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); + cpu_set_core_cap_bits(c); + fpu__init_system(c); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 0268185bef94..29a3bedabd06 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -5,6 +5,7 @@ #include <asm/msr-index.h> #include <asm/processor.h> #include <asm/vmx.h> +#include "cpu.h" #undef pr_fmt #define pr_fmt(fmt) "x86/cpu: " fmt diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be82cd5841c3..9a26e972cdea 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -19,6 +19,8 @@ #include <asm/microcode_intel.h> #include <asm/hwcap2.h> #include <asm/elf.h> +#include <asm/cpu_device_id.h> +#include <asm/cmdline.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -31,6 +33,20 @@ #include <asm/apic.h> #endif +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, +}; + +/* + * Default to sld_off because most systems do not support split lock detection + * split_lock_setup() will switch this to sld_warn on systems that support + * split lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -570,6 +586,8 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } +static void split_lock_init(void); + static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -684,6 +702,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_enable(); if (tsx_ctrl_state == TSX_CTRL_DISABLE) tsx_disable(); + + split_lock_init(); } #ifdef CONFIG_X86_32 @@ -945,3 +965,166 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); + +#undef pr_fmt +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, +}; + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init split_lock_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + + switch (state) { + case sld_off: + pr_info("disabled\n"); + return; + case sld_warn: + pr_info("warning about user-space split_locks\n"); + break; + case sld_fatal: + pr_info("sending SIGBUS on user-space split_locks\n"); + break; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + sld_state = state; + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +static void split_lock_init(void) +{ + split_lock_verify_msr(sld_state != sld_off); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + + /* + * Disable the split lock detection for this task so it can make + * progress and set TIF_SLD so the detection is re-enabled via + * switch_to_sld() when the task is scheduled out. + */ + sld_update_msr(false); + set_tsk_thread_flag(current, TIF_SLD); + return true; +} + +/* + * This function is called only when switching between tasks with + * different split-lock detection modes. It sets the MSR for the + * mode of the new task. This is right most of the time, but since + * the MSR is shared by hyperthreads on a physical core there can + * be glitches when the two threads need different modes. + */ +void switch_to_sld(unsigned long tifn) +{ + sld_update_msr(!(tifn & _TIF_SLD)); +} + +#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY} + +/* + * The following processors have the split lock detection feature. But + * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot + * be enumerated. Enable it by family and model matching on these + * processors. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X), + SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L), + {} +}; + +void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_core_caps = 0; + + if (c->x86_vendor != X86_VENDOR_INTEL) + return; + if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) { + /* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + } else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + /* Enumerate split lock detection by family and model. */ + if (x86_match_cpu(split_lock_cpu_ids)) + ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT; + } + + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + split_lock_setup(); +} diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 6dd78d8235e4..d3482eb43ff3 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -16,12 +16,17 @@ * respective wildcard entries. * * A typical table entry would be to match a specific CPU - * { X86_VENDOR_INTEL, 6, 0x12 } - * or to match a specific CPU feature - * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, + * X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, - * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) + * + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts + * for various common selections. The above can be shortened to: + * + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index b3a50d962851..52de616a8065 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1163,9 +1163,12 @@ static const struct sysfs_ops threshold_ops = { .store = store, }; +static void threshold_block_release(struct kobject *kobj); + static struct kobj_type threshold_ktype = { .sysfs_ops = &threshold_ops, .default_attrs = default_attrs, + .release = threshold_block_release, }; static const char *get_name(unsigned int bank, struct threshold_block *b) @@ -1198,8 +1201,9 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return buf_mcatype; } -static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, - unsigned int block, u32 address) +static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb, + unsigned int bank, unsigned int block, + u32 address) { struct threshold_block *b = NULL; u32 low, high; @@ -1243,16 +1247,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, INIT_LIST_HEAD(&b->miscj); - if (per_cpu(threshold_banks, cpu)[bank]->blocks) { - list_add(&b->miscj, - &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - } else { - per_cpu(threshold_banks, cpu)[bank]->blocks = b; - } + if (tb->blocks) + list_add(&b->miscj, &tb->blocks->miscj); + else + tb->blocks = b; - err = kobject_init_and_add(&b->kobj, &threshold_ktype, - per_cpu(threshold_banks, cpu)[bank]->kobj, - get_name(bank, b)); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b)); if (err) goto out_free; recurse: @@ -1260,7 +1260,7 @@ recurse: if (!address) return 0; - err = allocate_threshold_blocks(cpu, bank, block, address); + err = allocate_threshold_blocks(cpu, tb, bank, block, address); if (err) goto out_free; @@ -1345,8 +1345,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out_free; } - per_cpu(threshold_banks, cpu)[bank] = b; - if (is_shared_bank(bank)) { refcount_set(&b->cpus, 1); @@ -1357,9 +1355,13 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); - if (!err) - goto out; + err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); + if (err) + goto out_free; + + per_cpu(threshold_banks, cpu)[bank] = b; + + return 0; out_free: kfree(b); @@ -1368,8 +1370,12 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) return err; } -static void deallocate_threshold_block(unsigned int cpu, - unsigned int bank) +static void threshold_block_release(struct kobject *kobj) +{ + kfree(to_block(kobj)); +} + +static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { struct threshold_block *pos = NULL; struct threshold_block *tmp = NULL; @@ -1379,13 +1385,11 @@ static void deallocate_threshold_block(unsigned int cpu, return; list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_put(&pos->kobj); list_del(&pos->miscj); - kfree(pos); + kobject_put(&pos->kobj); } - kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); - per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; + kobject_put(&head->blocks->kobj); } static void __threshold_remove_blocks(struct threshold_bank *b) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2c4f949611e4..54165f3569e8 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -142,6 +142,8 @@ void mce_setup(struct mce *m) if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) rdmsrl(MSR_PPIN, m->ppin); + else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) + rdmsrl(MSR_AMD_PPIN, m->ppin); m->microcode = boot_cpu_data.microcode; } @@ -1213,8 +1215,14 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * On Intel systems this is entered on all CPUs in parallel through * MCE broadcast. However some CPUs might be broken beyond repair, * so be always careful when synchronizing with others. + * + * Tracing and kprobes are disabled: if we interrupted a kernel context + * with IF=1, we need to minimize stack usage. There are also recursion + * issues: if the machine check was due to a failure of the memory + * backing the user stack, tracing that reads the user stack will cause + * potentially infinite recursion. */ -void do_machine_check(struct pt_regs *regs, long error_code) +void notrace do_machine_check(struct pt_regs *regs, long error_code) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1360,6 +1368,7 @@ out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); +NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) @@ -1877,6 +1886,8 @@ bool filter_mce(struct mce *m) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return amd_filter_mce(m); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return intel_filter_mce(m); return false; } @@ -1892,10 +1903,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; -dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code) { machine_check_vector(regs, error_code); } +NOKPROBE_SYMBOL(do_mce); /* * Called for each booted CPU to set up machine checks. diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 7c8958dee103..d089567a9ce8 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -29,11 +29,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL }; * separate MCEs from kernel messages to avoid bogus bug reports. */ -static struct mce_log_buffer mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; +static struct mce_log_buffer *mcelog; static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); @@ -45,21 +41,21 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, mutex_lock(&mce_chrdev_read_mutex); - entry = mcelog.next; + entry = mcelog->next; /* * When the buffer fills up discard new entries. Assume that the * earlier errors are the more interesting ones: */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + if (entry >= mcelog->len) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags); goto unlock; } - mcelog.next = entry + 1; + mcelog->next = entry + 1; - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - mcelog.entry[entry].finished = 1; + memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); + mcelog->entry[entry].finished = 1; /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); @@ -214,21 +210,21 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, /* Only supports full reads right now */ err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + if (*off != 0 || usize < mcelog->len * sizeof(struct mce)) goto out; - next = mcelog.next; + next = mcelog->next; err = 0; for (i = 0; i < next; i++) { - struct mce *m = &mcelog.entry[i]; + struct mce *m = &mcelog->entry[i]; err |= copy_to_user(buf, m, sizeof(*m)); buf += sizeof(*m); } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog->entry, 0, next * sizeof(struct mce)); + mcelog->next = 0; if (err) err = -EFAULT; @@ -242,7 +238,7 @@ out: static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) + if (READ_ONCE(mcelog->next)) return EPOLLIN | EPOLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return EPOLLIN | EPOLLRDNORM; @@ -261,13 +257,13 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, case MCE_GET_RECORD_LEN: return put_user(sizeof(struct mce), p); case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); + return put_user(mcelog->len, p); case MCE_GETCLEAR_FLAGS: { unsigned flags; do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + flags = mcelog->flags; + } while (cmpxchg(&mcelog->flags, flags, 0) != flags); return put_user(flags, p); } @@ -339,8 +335,18 @@ static struct miscdevice mce_chrdev_device = { static __init int dev_mcelog_init_device(void) { + int mce_log_len; int err; + mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); + mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL); + if (!mcelog) + return -ENOMEM; + + strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); + mcelog->len = mce_log_len; + mcelog->recordlen = sizeof(struct mce); + /* register character device /dev/mcelog */ err = misc_register(&mce_chrdev_device); if (err) { @@ -350,6 +356,7 @@ static __init int dev_mcelog_init_device(void) else pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + kfree(mcelog); return err; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 5627b1091b85..d8f9230d2034 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -493,17 +493,18 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) return; if ((val & 3UL) == 1UL) { - /* PPIN available but disabled: */ + /* PPIN locked in disabled mode */ return; } - /* If PPIN is disabled, but not locked, try to enable: */ - if (!(val & 3UL)) { + /* If PPIN is disabled, try to enable */ + if (!(val & 2UL)) { wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); rdmsrl_safe(MSR_PPIN_CTL, &val); } - if ((val & 3UL) == 2UL) + /* Is the enable bit set? */ + if (val & 2UL) set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); } } @@ -520,3 +521,20 @@ void mce_intel_feature_clear(struct cpuinfo_x86 *c) { intel_clear_lmce(); } + +bool intel_filter_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */ + if ((c->x86 == 6) && + ((c->x86_model == INTEL_FAM6_HASWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_L) || + (c->x86_model == INTEL_FAM6_BROADWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_G)) && + (m->bank == 0) && + ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b785c0d0b590..3b008172ad73 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -8,6 +8,9 @@ #include <linux/device.h> #include <asm/mce.h> +/* Pointer to the installed machine check handler for this CPU setup. */ +extern void (*machine_check_vector)(struct pt_regs *, long error_code); + enum severity_level { MCE_NO_SEVERITY, MCE_DEFERRED_SEVERITY, @@ -48,6 +51,7 @@ void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); +bool intel_filter_mce(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -56,6 +60,7 @@ static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } +static inline bool intel_filter_mce(struct mce *m) { return false; }; #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index 58b4ee3cda77..f36dc0742085 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -486,9 +486,14 @@ static int thermal_throttle_offline(unsigned int cpu) { struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + u32 l; + + /* Mask the thermal vector before draining evtl. pending work */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); - cancel_delayed_work(&state->package_throttle.therm_work); - cancel_delayed_work(&state->core_throttle.therm_work); + cancel_delayed_work_sync(&state->package_throttle.therm_work); + cancel_delayed_work_sync(&state->core_throttle.therm_work); state->package_throttle.rate_control_active = false; state->core_throttle.rate_control_active = false; diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index c222f283b456..300e3fd5ade3 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <asm/msr.h> +#include <asm/mwait.h> #define UMWAIT_C02_ENABLE 0 diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 46d732696c1c..9b6fafa69be9 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -25,6 +25,8 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/clocksource.h> +#include <linux/cpu.h> +#include <linux/reboot.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -47,6 +49,11 @@ #define VMWARE_CMD_GETVCPU_INFO 68 #define VMWARE_CMD_LEGACY_X2APIC 3 #define VMWARE_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_STEALCLOCK 91 + +#define STEALCLOCK_NOT_AVAILABLE (-1) +#define STEALCLOCK_DISABLED 0 +#define STEALCLOCK_ENABLED 1 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx), %%eax" : \ @@ -86,6 +93,18 @@ } \ } while (0) +struct vmware_steal_time { + union { + uint64_t clock; /* stolen time counter in units of vtsc */ + struct { + /* only for little-endian */ + uint32_t clock_low; + uint32_t clock_high; + }; + }; + uint64_t reserved[7]; +}; + static unsigned long vmware_tsc_khz __ro_after_init; static u8 vmware_hypercall_mode __ro_after_init; @@ -103,15 +122,25 @@ static unsigned long vmware_get_tsc_khz(void) #ifdef CONFIG_PARAVIRT static struct cyc2ns_data vmware_cyc2ns __ro_after_init; -static int vmw_sched_clock __initdata = 1; +static bool vmw_sched_clock __initdata = true; +static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64); +static bool has_steal_clock; +static bool steal_acc __initdata = true; /* steal time accounting */ static __init int setup_vmw_sched_clock(char *s) { - vmw_sched_clock = 0; + vmw_sched_clock = false; return 0; } early_param("no-vmw-sched-clock", setup_vmw_sched_clock); +static __init int parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} +early_param("no-steal-acc", parse_no_stealacc); + static unsigned long long notrace vmware_sched_clock(void) { unsigned long long ns; @@ -122,7 +151,7 @@ static unsigned long long notrace vmware_sched_clock(void) return ns; } -static void __init vmware_sched_clock_setup(void) +static void __init vmware_cyc2ns_setup(void) { struct cyc2ns_data *d = &vmware_cyc2ns; unsigned long long tsc_now = rdtsc(); @@ -132,17 +161,201 @@ static void __init vmware_sched_clock_setup(void) d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, d->cyc2ns_shift); - pv_ops.time.sched_clock = vmware_sched_clock; - pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); + pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); +} + +static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +{ + uint32_t result, info; + + asm volatile (VMWARE_HYPERCALL : + "=a"(result), + "=c"(info) : + "a"(VMWARE_HYPERVISOR_MAGIC), + "b"(0), + "c"(VMWARE_CMD_STEALCLOCK), + "d"(0), + "S"(arg1), + "D"(arg2) : + "memory"); + return result; +} + +static bool stealclock_enable(phys_addr_t pa) +{ + return vmware_cmd_stealclock(upper_32_bits(pa), + lower_32_bits(pa)) == STEALCLOCK_ENABLED; +} + +static int __stealclock_disable(void) +{ + return vmware_cmd_stealclock(0, 1); +} + +static void stealclock_disable(void) +{ + __stealclock_disable(); +} + +static bool vmware_is_stealclock_available(void) +{ + return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE; +} + +/** + * vmware_steal_clock() - read the per-cpu steal clock + * @cpu: the cpu number whose steal clock we want to read + * + * The function reads the steal clock if we are on a 64-bit system, otherwise + * reads it in parts, checking that the high part didn't change in the + * meantime. + * + * Return: + * The steal clock reading in ns. + */ +static uint64_t vmware_steal_clock(int cpu) +{ + struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu); + uint64_t clock; + + if (IS_ENABLED(CONFIG_64BIT)) + clock = READ_ONCE(steal->clock); + else { + uint32_t initial_high, low, high; + + do { + initial_high = READ_ONCE(steal->clock_high); + /* Do not reorder initial_high and high readings */ + virt_rmb(); + low = READ_ONCE(steal->clock_low); + /* Keep low reading in between */ + virt_rmb(); + high = READ_ONCE(steal->clock_high); + } while (initial_high != high); + + clock = ((uint64_t)high << 32) | low; + } + + return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); +} + +static void vmware_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu); + + if (!has_steal_clock) + return; + + if (!stealclock_enable(slow_virt_to_phys(st))) { + has_steal_clock = false; + return; + } + + pr_info("vmware-stealtime: cpu %d, pa %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); } +static void vmware_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + stealclock_disable(); +} + +static void vmware_guest_cpu_init(void) +{ + if (has_steal_clock) + vmware_register_steal_time(); +} + +static void vmware_pv_guest_cpu_reboot(void *unused) +{ + vmware_disable_steal_time(); +} + +static int vmware_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block vmware_pv_reboot_nb = { + .notifier_call = vmware_pv_reboot_notify, +}; + +#ifdef CONFIG_SMP +static void __init vmware_smp_prepare_boot_cpu(void) +{ + vmware_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static int vmware_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + vmware_guest_cpu_init(); + local_irq_enable(); + return 0; +} + +static int vmware_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + vmware_disable_steal_time(); + local_irq_enable(); + return 0; +} +#endif + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + static void __init vmware_paravirt_ops_setup(void) { pv_info.name = "VMware hypervisor"; pv_ops.cpu.io_delay = paravirt_nop; - if (vmware_tsc_khz && vmw_sched_clock) - vmware_sched_clock_setup(); + if (vmware_tsc_khz == 0) + return; + + vmware_cyc2ns_setup(); + + if (vmw_sched_clock) + pv_ops.time.sched_clock = vmware_sched_clock; + + if (vmware_is_stealclock_available()) { + has_steal_clock = true; + pv_ops.time.steal_clock = vmware_steal_clock; + + /* We use reboot notifier only to disable steal clock */ + register_reboot_notifier(&vmware_pv_reboot_nb); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = + vmware_smp_prepare_boot_cpu; + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/vmware:online", + vmware_cpu_online, + vmware_cpu_down_prepare) < 0) + pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n"); +#else + vmware_guest_cpu_init(); +#endif + } } #else #define vmware_paravirt_ops_setup() do {} while (0) @@ -213,7 +426,7 @@ static void __init vmware_platform_setup(void) vmware_set_capabilities(); } -static u8 vmware_select_hypercall(void) +static u8 __init vmware_select_hypercall(void) { int eax, ebx, ecx, edx; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a1806598aaa4..32b153d38748 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -120,11 +120,6 @@ static bool xfeature_is_supervisor(int xfeature_nr) return ecx & 1; } -static bool xfeature_is_user(int xfeature_nr) -{ - return !xfeature_is_supervisor(xfeature_nr); -} - /* * When executing XSAVEOPT (or other optimized XSAVE instructions), if * a processor implementation detects that an FPU state component is still @@ -265,21 +260,25 @@ static void __init setup_xstate_features(void) cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_sizes[i] = eax; + /* - * If an xfeature is supervisor state, the offset - * in EBX is invalid. We leave it to -1. + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. */ - if (xfeature_is_user(i)) - xstate_offsets[i] = ebx; + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; - xstate_sizes[i] = eax; /* - * In our xstate size checks, we assume that the - * highest-numbered xstate feature has the - * highest offset in the buffer. Ensure it does. + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. */ WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; } } @@ -326,6 +325,13 @@ static int xfeature_is_aligned(int xfeature_nr) u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); + + if (!xfeature_enabled(xfeature_nr)) { + WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n", + xfeature_nr); + return 0; + } + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); /* * The value returned by ECX[1] indicates the alignment @@ -338,11 +344,11 @@ static int xfeature_is_aligned(int xfeature_nr) /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format - * of the xsave aread. + * of the xsave area. */ -static void __init setup_xstate_comp(void) +static void __init setup_xstate_comp_offsets(void) { - unsigned int xstate_comp_sizes[XFEATURE_MAX]; + unsigned int next_offset; int i; /* @@ -356,31 +362,23 @@ static void __init setup_xstate_comp(void) if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) { + if (xfeature_enabled(i)) xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } } return; } - xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = - FXSAVE_SIZE + XSAVE_HDR_SIZE; + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; + if (!xfeature_enabled(i)) + continue; - if (i > FIRST_EXTENDED_XFEATURE) { - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); - if (xfeature_is_aligned(i)) - xstate_comp_offsets[i] = - ALIGN(xstate_comp_offsets[i], 64); - } + xstate_comp_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; } } @@ -774,7 +772,7 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); - setup_xstate_comp(); + setup_xstate_comp_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", @@ -897,8 +895,6 @@ const void *get_xsave_field_ptr(int xfeature_nr) #ifdef CONFIG_ARCH_HAS_PKEYS -#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) -#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. @@ -917,6 +913,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. + */ + WARN_ON_ONCE(pkey >= arch_max_pkey()); + /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3923ab4630d7..f66a6b90f954 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -67,11 +67,6 @@ __HEAD SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - us to not reload segments */ - testb $KEEP_SEGMENTS, BP_loadflags(%esi) - jnz 2f - /* * Set segments to known values. */ @@ -82,7 +77,6 @@ SYM_CODE_START(startup_32) movl %eax,%fs movl %eax,%gs movl %eax,%ss -2: leal -__PAGE_OFFSET(%ecx),%esp /* diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 4d4f5d9faac3..7dfb1e808928 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -10,8 +10,6 @@ extern struct boot_params boot_params; static enum efi_secureboot_mode get_sb_mode(void) { - efi_char16_t efi_SecureBoot_name[] = L"SecureBoot"; - efi_char16_t efi_SetupMode_name[] = L"SecureBoot"; efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; efi_status_t status; unsigned long size; @@ -19,13 +17,13 @@ static enum efi_secureboot_mode get_sb_mode(void) size = sizeof(secboot); - if (!efi_enabled(EFI_RUNTIME_SERVICES)) { + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { pr_info("ima: secureboot mode unknown, no efi\n"); return efi_secureboot_mode_unknown; } /* Get variable contents into buffer */ - status = efi.get_variable(efi_SecureBoot_name, &efi_variable_guid, + status = efi.get_variable(L"SecureBoot", &efi_variable_guid, NULL, &size, &secboot); if (status == EFI_NOT_FOUND) { pr_info("ima: secureboot mode disabled\n"); @@ -38,7 +36,7 @@ static enum efi_secureboot_mode get_sb_mode(void) } size = sizeof(setupmode); - status = efi.get_variable(efi_SetupMode_name, &efi_variable_guid, + status = efi.get_variable(L"SetupMode", &efi_variable_guid, NULL, &size, &setupmode); if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8abeee0dd7bf..a53e7b4a7419 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -13,6 +13,7 @@ #include <asm/io_bitmap.h> #include <asm/desc.h> +#include <asm/syscalls.h> #ifdef CONFIG_X86_IOPL_IOPERM diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 21efee32e2b1..c7965ff429c5 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -230,7 +230,7 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). */ -__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +__visible void __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc * desc; @@ -263,7 +263,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); - return 1; } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 16919a9671fa..5aa523c2d573 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -44,15 +44,6 @@ * (these are usually mapped into the 0x30-0xff vector range) */ -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", - .flags = IRQF_NO_THREAD, -}; - DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; @@ -84,7 +75,7 @@ void __init init_IRQ(void) * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If - * these IRQ's are handled by more mordern controllers like IO-APIC, + * these IRQs are handled by more modern controllers like IO-APIC, * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. */ @@ -104,6 +95,9 @@ void __init native_init_IRQ(void) idt_setup_apic_and_irq_gates(); lapic_assign_system_vectors(); - if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) - setup_irq(2, &irq2); + if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { + /* IRQ2 is cascade interrupt to second interrupt controller */ + if (request_irq(2, no_action, IRQF_NO_THREAD, "cascade", NULL)) + pr_err("%s: request_irq() failed\n", "cascade"); + } } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 9c4498ea0b3c..5ba8477c2cb7 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -58,7 +58,7 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, return code; } -static void inline __jump_label_transform(struct jump_entry *entry, +static inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f293d872602a..db6578d45157 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -141,9 +141,8 @@ prepare_add_efi_setup_data(struct boot_params *params, struct setup_data *sd = (void *)params + efi_setup_data_offset; struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); - esd->fw_vendor = efi.fw_vendor; - esd->runtime = efi.runtime; - esd->tables = efi.config_table; + esd->fw_vendor = efi_fw_vendor; + esd->tables = efi_config_table; esd->smbios = efi.smbios; sd->type = SETUP_EFI; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3f45b5c43a71..ea13f6888284 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -71,6 +71,21 @@ found: return (unsigned long)buf; } +static void synthesize_clac(kprobe_opcode_t *addr) +{ + /* + * Can't be static_cpu_has() due to how objtool treats this feature bit. + * This isn't a fast path anyway. + */ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return; + + /* Replace the NOP3 with CLAC */ + addr[0] = 0x0f; + addr[1] = 0x01; + addr[2] = 0xca; +} + /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) { @@ -92,6 +107,9 @@ asm ( /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 SAVE_REGS_STRING " movq %rsp, %rsi\n" ".global optprobe_template_val\n" @@ -111,6 +129,9 @@ asm ( #else /* CONFIG_X86_32 */ " pushl %esp\n" " pushfl\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 SAVE_REGS_STRING " movl %esp, %edx\n" ".global optprobe_template_val\n" @@ -134,6 +155,8 @@ asm ( void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); +#define TMPL_CLAC_IDX \ + ((long)optprobe_template_clac - (long)optprobe_template_entry) #define TMPL_MOVE_IDX \ ((long)optprobe_template_val - (long)optprobe_template_entry) #define TMPL_CALL_IDX \ @@ -389,6 +412,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, op->optinsn.size = ret; len = TMPL_END_IDX + op->optinsn.size; + synthesize_clac(buf + TMPL_CLAC_IDX); + /* Set probe information */ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d817f255aed8..6efe0410fb72 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -425,7 +425,29 @@ static void __init sev_map_percpu_data(void) } } +static bool pv_tlb_flush_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); +} + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + #ifdef CONFIG_SMP + +static bool pv_ipi_supported(void) +{ + return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); +} + +static bool pv_sched_yield_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); +} + #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) static void __send_ipi_mask(const struct cpumask *mask, int vector) @@ -490,12 +512,12 @@ static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) { unsigned int this_cpu = smp_processor_id(); - struct cpumask new_mask; + struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); const struct cpumask *local_mask; - cpumask_copy(&new_mask, mask); - cpumask_clear_cpu(this_cpu, &new_mask); - local_mask = &new_mask; + cpumask_copy(new_mask, mask); + cpumask_clear_cpu(this_cpu, new_mask); + local_mask = new_mask; __send_ipi_mask(local_mask, vector); } @@ -575,7 +597,6 @@ static void __init kvm_apf_trap_init(void) update_intr_gate(X86_TRAP_PF, async_page_fault); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask); static void kvm_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) @@ -583,7 +604,7 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask, u8 state; int cpu; struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask); + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); cpumask_copy(flushmask, cpumask); /* @@ -619,11 +640,10 @@ static void __init kvm_guest_init(void) pv_ops.time.steal_clock = kvm_steal_clock; } - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + if (pv_tlb_flush_supported()) { pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -632,9 +652,7 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; - if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && - !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; pr_info("KVM setup pv sched yield\n"); } @@ -700,7 +718,7 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { #if defined(CONFIG_SMP) - if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + if (pv_ipi_supported()) kvm_setup_pv_ipi(); #endif } @@ -732,26 +750,31 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); -static __init int kvm_setup_pv_tlb_flush(void) +static __init int kvm_alloc_cpumask(void) { int cpu; + bool alloc = false; if (!kvm_para_available() || nopv) return 0; - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + if (pv_tlb_flush_supported()) + alloc = true; + +#if defined(CONFIG_SMP) + if (pv_ipi_supported()) + alloc = true; +#endif + + if (alloc) for_each_possible_cpu(cpu) { - zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), GFP_KERNEL, cpu_to_node(cpu)); } - pr_info("KVM setup pv remote TLB flush\n"); - } return 0; } -arch_initcall(kvm_setup_pv_tlb_flush); +arch_initcall(kvm_alloc_cpumask); #ifdef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 904494b924c1..34b18f6eeb2c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -159,12 +159,19 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } +static int kvm_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK); + return 0; +} + struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .enable = kvm_cs_enable, }; EXPORT_SYMBOL_GPL(kvm_clock); @@ -272,7 +279,7 @@ static int __init kvm_setup_vsyscall_timeinfo(void) if (!(flags & PVCLOCK_TSC_STABLE_BIT)) return 0; - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; #endif kvmclock_init_mem(); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c57e1ca70fd1..84c3ba32f211 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -27,7 +27,6 @@ #include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> -#include <asm/syscalls.h> #include <asm/pgtable_areas.h> /* This is a multiple of PAGE_SIZE. */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 54c21d6abd5a..6407ea21fa1b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -403,9 +403,9 @@ static void default_do_nmi(struct pt_regs *regs) * a 'real' unknown NMI. For example, while processing * a perf NMI another perf NMI comes in along with a * 'real' unknown NMI. These two NMIs get combined into - * one (as descibed above). When the next NMI gets + * one (as described above). When the next NMI gets * processed, it will be flagged by perf as handled, but - * noone will know that there was a 'real' unknown NMI sent + * no one will know that there was a 'real' unknown NMI sent * also. As a result it gets swallowed. Or if the first * perf NMI returns two events handled then the second * NMI will get eaten by the logic below, again losing a diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 789f5e4f89de..c131ba4e70ef 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -30,6 +30,7 @@ #include <asm/timer.h> #include <asm/special_insns.h> #include <asm/tlb.h> +#include <asm/io_bitmap.h> /* * nop stub, which must not clobber anything *including the stack* to @@ -341,6 +342,10 @@ struct paravirt_patch_template pv_ops = { .cpu.iret = native_iret, .cpu.swapgs = native_swapgs, +#ifdef CONFIG_X86_IOPL_IOPERM + .cpu.update_io_bitmap = native_tss_update_io_bitmap, +#endif + .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 839b5244e3b7..9da70b279dad 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,7 +28,6 @@ #include <linux/hw_breakpoint.h> #include <asm/cpu.h> #include <asm/apic.h> -#include <asm/syscalls.h> #include <linux/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/internal.h> @@ -374,7 +373,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) /** * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode */ -void tss_update_io_bitmap(void) +void native_tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct thread_struct *t = ¤t->thread; @@ -650,6 +649,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } + + if ((tifp ^ tifn) & _TIF_SLD) + switch_to_sld(tifn); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5052ced43373..954b013cc585 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -49,7 +49,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ffd497804dbc..5ef9d8f25b0e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -48,7 +48,6 @@ #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> -#include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 10125358b9c4..11065dc03f5b 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -145,7 +145,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) { - WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); + WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)); pvti_cpu0_va = pvti; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0cc7c0b106bb..3ca43be4f9cf 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -531,7 +531,7 @@ static void emergency_vmx_disable_all(void) /* * We need to disable VMX on all CPUs before rebooting, otherwise - * we risk hanging up the machine, because the CPU ignore INIT + * we risk hanging up the machine, because the CPU ignores INIT * signals when VMX is enabled. * * We can't take any locks and we may be on an inconsistent diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ef3ba99068d3..a4d9a261425b 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -9,6 +9,8 @@ #include <asm/kexec.h> #include <asm/processor-flags.h> #include <asm/pgtable_types.h> +#include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> /* * Must be relocatable PIC code callable as a C function @@ -39,6 +41,7 @@ .align PAGE_SIZE .code64 SYM_CODE_START_NOALIGN(relocate_kernel) + UNWIND_HINT_EMPTY /* * %rdi indirection_page * %rsi page_list @@ -105,6 +108,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) + UNWIND_HINT_EMPTY /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ @@ -192,14 +196,12 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) 1: popq %rdx leaq PAGE_SIZE(%r10), %rsp + ANNOTATE_RETPOLINE_SAFE call *%rdx /* get the re-entry point of the peer system */ movq 0(%rsp), %rbp - call 1f -1: - popq %r8 - subq $(1b - relocate_kernel), %r8 + leaq relocate_kernel(%rip), %r8 movq CP_PA_SWAP_PAGE(%r8), %r10 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi movq CP_PA_TABLE_PAGE(%r8), %rax @@ -212,6 +214,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) + UNWIND_HINT_EMPTY movq RSP(%r8), %rsp movq CR4(%r8), %rax movq %rax, %cr4 @@ -233,6 +236,7 @@ SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) + UNWIND_HINT_EMPTY movq %rdi, %rcx /* Put the page_list in %rcx */ xorl %edi, %edi xorl %esi, %esi diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a74262c71484..e6b545047f38 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -64,7 +64,6 @@ RESERVE_BRK(dmi_alloc, 65536); * at link time, with RESERVE_BRK*() facility reserving additional * chunks. */ -static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8a29573851a3..83b74fb38c8f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -42,29 +42,9 @@ #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> -#include <asm/syscalls.h> - #include <asm/sigframe.h> #include <asm/signal.h> -#define COPY(x) do { \ - get_user_ex(regs->x, &sc->x); \ -} while (0) - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG(seg) do { \ - regs->seg = GET_SEG(seg); \ -} while (0) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - #ifdef CONFIG_X86_64 /* * If regs->ss will cause an IRET fault, change it. Otherwise leave it @@ -92,53 +72,58 @@ static void force_valid_ss(struct pt_regs *regs) ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } +# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) +#else +# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *sc, + struct sigcontext __user *usc, unsigned long uc_flags) { - unsigned long buf_val; - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; + struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { + if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) + return -EFAULT; #ifdef CONFIG_X86_32 - set_user_gs(regs, GET_SEG(gs)); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); + set_user_gs(regs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; #endif /* CONFIG_X86_32 */ - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; #ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; #endif /* CONFIG_X86_64 */ - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); - - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; - get_user_ex(buf_val, &sc->fpstate); - buf = (void __user *)buf_val; - } get_user_catch(err); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; #ifdef CONFIG_X86_64 /* @@ -149,70 +134,78 @@ static int restore_sigcontext(struct pt_regs *regs, force_valid_ss(regs); #endif - err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); - - return err; + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int err = 0; - - put_user_try { - #ifdef CONFIG_X86_32 - put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); - put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); - put_user_ex(regs->es, (unsigned int __user *)&sc->es); - put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); + unsafe_put_user(get_user_gs(regs), + (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); #ifdef CONFIG_X86_64 - put_user_ex(regs->r8, &sc->r8); - put_user_ex(regs->r9, &sc->r9); - put_user_ex(regs->r10, &sc->r10); - put_user_ex(regs->r11, &sc->r11); - put_user_ex(regs->r12, &sc->r12); - put_user_ex(regs->r13, &sc->r13); - put_user_ex(regs->r14, &sc->r14); - put_user_ex(regs->r15, &sc->r15); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); #ifdef CONFIG_X86_32 - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); #else /* !CONFIG_X86_32 */ - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); - put_user_ex(regs->ss, &sc->ss); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); - - /* non-iBCS2 extensions.. */ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - return err; + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; } +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + /* * Set up a signal frame. */ @@ -312,26 +305,16 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, { struct sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; + void __user *fp = NULL; - if (__put_user(sig, &frame->sig)) - return -EFAULT; + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - if (_NSIG_WORDS > 1) { - if (__copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } - + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; @@ -341,7 +324,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); + unsafe_put_user(restorer, &frame->pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 @@ -350,10 +333,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. */ - err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; + unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); + user_access_end(); /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -368,6 +349,10 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, regs->cs = __USER_CS; return 0; + +Efault: + user_access_end(); + return -EFAULT; } static int __setup_rt_frame(int sig, struct ksignal *ksig, @@ -375,50 +360,45 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(&frame->info, &frame->pinfo); - put_user_ex(&frame->uc, &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(&frame->info, &frame->pinfo, Efault); + unsafe_put_user(&frame->uc, &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - put_user_ex(restorer, &frame->pretcode); + /* Set up to return from userspace. */ + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, &frame->pretcode, Efault); - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); - } put_user_catch(err); + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) + if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ @@ -434,6 +414,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->cs = __USER_CS; return 0; +Efault: + user_access_end(); + return -EFAULT; } #else /* !CONFIG_X86_32 */ static unsigned long frame_uc_flags(struct pt_regs *regs) @@ -457,43 +440,34 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; - int err = 0; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } - uc_flags = frame_uc_flags(regs); - - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - err |= -EFAULT; - } - } put_user_catch(err); - - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) - return -EFAULT; - /* Set up registers for signal handler */ regs->di = sig; /* In case the signal handler was declared without prototypes */ @@ -530,6 +504,10 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, force_valid_ss(regs); return 0; + +Efault: + user_access_end(); + return -EFAULT; } #endif /* CONFIG_X86_32 */ @@ -541,44 +519,33 @@ static int x32_setup_rt_frame(struct ksignal *ksig, struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + void __user *fp = NULL; - if (!access_ok(frame, sizeof(*frame))) + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) - return -EFAULT; - } + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); uc_flags = frame_uc_flags(regs); - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); - put_user_ex(0, &frame->uc.uc__pad0); - - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = ksig->ka.sa.sa_restorer; - } else { - /* could use a vstub here */ - restorer = NULL; - err |= -EFAULT; - } - put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); - } put_user_catch(err); + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - if (err) - return -EFAULT; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + return -EFAULT; + } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -597,6 +564,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, #endif /* CONFIG_X86_X32_ABI */ return 0; +#ifdef CONFIG_X86_X32_ABI +Efault: + user_access_end(); + return -EFAULT; +#endif } /* @@ -613,9 +585,8 @@ SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) + if (__get_user(set.sig[0], &frame->sc.oldmask) || + __get_user(set.sig[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); @@ -645,7 +616,7 @@ SYSCALL_DEFINE0(rt_sigreturn) frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; @@ -859,7 +830,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) } #ifdef CONFIG_X86_X32_ABI -asmlinkage long sys32_x32_rt_sigreturn(void) +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; @@ -870,7 +841,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 69881b2d446c..fe3ab9632f3b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } +static void init_freq_invariance(void); + /* * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. @@ -183,6 +185,8 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); + init_freq_invariance(); + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. Previous call to @@ -466,7 +470,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static const struct x86_cpu_id snc_cpu[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL), {} }; @@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); - + init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { @@ -1434,7 +1438,7 @@ early_param("possible_cpus", _setup_possible_cpus); /* * cpu_possible_mask should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to + * are allocated by some modules at init time, and don't expect to * do this dynamically on cpu arrival/departure. * cpu_present_mask on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current @@ -1764,3 +1768,287 @@ void native_play_dead(void) } #endif + +/* + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. + */ + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static DEFINE_PER_CPU(u64, arch_prev_aperf); +static DEFINE_PER_CPU(u64, arch_prev_mperf); +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) +{ + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} + +static bool turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> + +#define ICPU(model) \ + {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_XEON_PHI_KNL), + ICPU(INTEL_FAM6_XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_ATOM_GOLDMONT), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), + {} +}; + +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + if (!x86_match_cpu(has_knl_turbo_ratio_limits)) + return false; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} + +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */ + + return true; +} + +static bool intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: + arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, + base_freq); + arch_set_max_freq_ratio(turbo_disabled()); + return true; +} + +static void init_counter_refs(void *arg) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); +} + +static void init_freq_invariance(void) +{ + bool ret = false; + + if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + ret = intel_set_max_freq_ratio(); + + if (ret) { + on_each_cpu(init_counter_refs, NULL, 1); + static_branch_enable(&arch_scale_freq_key); + } else { + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); + } +} + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +void arch_scale_freq_tick(void) +{ + u64 freq_scale; + u64 aperf, mperf; + u64 acnt, mcnt; + + if (!arch_scale_freq_invariant()) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + acnt = aperf - this_cpu_read(arch_prev_aperf); + mcnt = mperf - this_cpu_read(arch_prev_mperf); + if (!mcnt) + return; + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); + + acnt <<= 2*SCHED_CAPACITY_SHIFT; + mcnt *= arch_max_freq_ratio; + + freq_scale = div64_u64(acnt, mcnt); + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); +} diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2d6898c2cb64..6ad43fc44556 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -96,7 +96,8 @@ struct stack_frame_user { }; static int -copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) +copy_stack_frame(const struct stack_frame_user __user *fp, + struct stack_frame_user *frame) { int ret; @@ -105,7 +106,8 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) ret = 1; pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + if (__get_user(frame->next_fp, &fp->next_fp) || + __get_user(frame->ret_addr, &fp->ret_addr)) ret = 0; pagefault_enable(); diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index 21790307121e..ab03fede1422 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -51,20 +51,80 @@ #define AA(__x) ((unsigned long)(__x)) - -COMPAT_SYSCALL_DEFINE3(x86_truncate64, const char __user *, filename, - unsigned long, offset_low, unsigned long, offset_high) +SYSCALL_DEFINE3(ia32_truncate64, const char __user *, filename, + unsigned long, offset_low, unsigned long, offset_high) { return ksys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); } -COMPAT_SYSCALL_DEFINE3(x86_ftruncate64, unsigned int, fd, - unsigned long, offset_low, unsigned long, offset_high) +SYSCALL_DEFINE3(ia32_ftruncate64, unsigned int, fd, + unsigned long, offset_low, unsigned long, offset_high) { return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); } +/* warning: next two assume little endian */ +SYSCALL_DEFINE5(ia32_pread64, unsigned int, fd, char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + +SYSCALL_DEFINE5(ia32_pwrite64, unsigned int, fd, const char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + + +/* + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. + */ +SYSCALL_DEFINE6(ia32_fadvise64_64, int, fd, __u32, offset_low, + __u32, offset_high, __u32, len_low, __u32, len_high, + int, advice) +{ + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +SYSCALL_DEFINE4(ia32_readahead, int, fd, unsigned int, off_lo, + unsigned int, off_hi, size_t, count) +{ + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); +} + +SYSCALL_DEFINE6(ia32_sync_file_range, int, fd, unsigned int, off_low, + unsigned int, off_hi, unsigned int, n_low, + unsigned int, n_hi, int, flags) +{ + return ksys_sync_file_range(fd, + ((u64)off_hi << 32) | off_low, + ((u64)n_hi << 32) | n_low, flags); +} + +SYSCALL_DEFINE5(ia32_fadvise64, int, fd, unsigned int, offset_lo, + unsigned int, offset_hi, size_t, len, int, advice) +{ + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); +} + +SYSCALL_DEFINE6(ia32_fallocate, int, fd, int, mode, + unsigned int, offset_lo, unsigned int, offset_hi, + unsigned int, len_lo, unsigned int, len_hi) +{ + return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); +} + +#ifdef CONFIG_IA32_EMULATION /* * Another set for IA32/LFS -- x86_64 struct stat is different due to * support for 64bit inode numbers. @@ -97,7 +157,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) return 0; } -COMPAT_SYSCALL_DEFINE2(x86_stat64, const char __user *, filename, +COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; @@ -108,7 +168,7 @@ COMPAT_SYSCALL_DEFINE2(x86_stat64, const char __user *, filename, return ret; } -COMPAT_SYSCALL_DEFINE2(x86_lstat64, const char __user *, filename, +COMPAT_SYSCALL_DEFINE2(ia32_lstat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; @@ -118,7 +178,7 @@ COMPAT_SYSCALL_DEFINE2(x86_lstat64, const char __user *, filename, return ret; } -COMPAT_SYSCALL_DEFINE2(x86_fstat64, unsigned int, fd, +COMPAT_SYSCALL_DEFINE2(ia32_fstat64, unsigned int, fd, struct stat64 __user *, statbuf) { struct kstat stat; @@ -128,7 +188,7 @@ COMPAT_SYSCALL_DEFINE2(x86_fstat64, unsigned int, fd, return ret; } -COMPAT_SYSCALL_DEFINE4(x86_fstatat, unsigned int, dfd, +COMPAT_SYSCALL_DEFINE4(ia32_fstatat64, unsigned int, dfd, const char __user *, filename, struct stat64 __user *, statbuf, int, flag) { @@ -156,7 +216,7 @@ struct mmap_arg_struct32 { unsigned int offset; }; -COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg) +COMPAT_SYSCALL_DEFINE1(ia32_mmap, struct mmap_arg_struct32 __user *, arg) { struct mmap_arg_struct32 a; @@ -170,70 +230,10 @@ COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg) a.offset>>PAGE_SHIFT); } -/* warning: next two assume little endian */ -COMPAT_SYSCALL_DEFINE5(x86_pread, unsigned int, fd, char __user *, ubuf, - u32, count, u32, poslo, u32, poshi) -{ - return ksys_pread64(fd, ubuf, count, - ((loff_t)AA(poshi) << 32) | AA(poslo)); -} - -COMPAT_SYSCALL_DEFINE5(x86_pwrite, unsigned int, fd, const char __user *, ubuf, - u32, count, u32, poslo, u32, poshi) -{ - return ksys_pwrite64(fd, ubuf, count, - ((loff_t)AA(poshi) << 32) | AA(poslo)); -} - - -/* - * Some system calls that need sign extended arguments. This could be - * done by a generic wrapper. - */ -COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low, - __u32, offset_high, __u32, len_low, __u32, len_high, - int, advice) -{ - return ksys_fadvise64_64(fd, - (((u64)offset_high)<<32) | offset_low, - (((u64)len_high)<<32) | len_low, - advice); -} - -COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo, - unsigned int, off_hi, size_t, count) -{ - return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); -} - -COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low, - unsigned int, off_hi, unsigned int, n_low, - unsigned int, n_hi, int, flags) -{ - return ksys_sync_file_range(fd, - ((u64)off_hi << 32) | off_low, - ((u64)n_hi << 32) | n_low, flags); -} - -COMPAT_SYSCALL_DEFINE5(x86_fadvise64, int, fd, unsigned int, offset_lo, - unsigned int, offset_hi, size_t, len, int, advice) -{ - return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, - len, advice); -} - -COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode, - unsigned int, offset_lo, unsigned int, offset_hi, - unsigned int, len_lo, unsigned int, len_hi) -{ - return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, - ((u64)len_hi << 32) | len_lo); -} - /* * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS */ -COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, +COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, unsigned long, tls_val, int __user *, child_tidptr) { @@ -252,3 +252,4 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, return _do_fork(&args); } +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index ca3c11a17b5a..504fa5425bce 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,7 +21,6 @@ #include <asm/elf.h> #include <asm/ia32.h> -#include <asm/syscalls.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index d8673d8a779b..106e7f87f534 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -62,19 +62,16 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - static void __init setup_default_timer_irq(void) { + unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER; + /* - * Unconditionally register the legacy timer; even without legacy - * PIC/PIT we need this for the HPET0 in legacy replacement mode. + * Unconditionally register the legacy timer interrupt; even + * without legacy PIC/PIT we need this for the HPET0 in legacy + * replacement mode. */ - if (setup_irq(0, &irq0)) + if (request_irq(0, timer_interrupt, flags, "timer", NULL)) pr_info("Failed to register legacy timer interrupt\n"); } @@ -122,18 +119,12 @@ void __init time_init(void) */ void clocksource_arch_init(struct clocksource *cs) { - if (cs->archdata.vclock_mode == VCLOCK_NONE) + if (cs->vdso_clock_mode == VDSO_CLOCKMODE_NONE) return; - if (cs->archdata.vclock_mode > VCLOCK_MAX) { - pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n", - cs->name, cs->archdata.vclock_mode); - cs->archdata.vclock_mode = VCLOCK_NONE; - } - if (cs->mask != CLOCKSOURCE_MASK(64)) { - pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n", + pr_warn("clocksource %s registered with invalid mask %016llx for VDSO. Disabling VDSO support.\n", cs->name, cs->mask); - cs->archdata.vclock_mode = VCLOCK_NONE; + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; } } diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index be5bc2e47c71..b8810ebbc8ae 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -59,39 +59,29 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug); */ int _debug_hotplug_cpu(int cpu, int action) { - struct device *dev = get_cpu_device(cpu); int ret; if (!cpu_is_hotpluggable(cpu)) return -EINVAL; - lock_device_hotplug(); - switch (action) { case 0: - ret = cpu_down(cpu); - if (!ret) { + ret = remove_cpu(cpu); + if (!ret) pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); - dev->offline = true; - kobject_uevent(&dev->kobj, KOBJ_OFFLINE); - } else + else pr_debug("Can't offline CPU%d.\n", cpu); break; case 1: - ret = cpu_up(cpu); - if (!ret) { - dev->offline = false; - kobject_uevent(&dev->kobj, KOBJ_ONLINE); - } else { + ret = add_cpu(cpu); + if (ret) pr_debug("Can't online CPU%d.\n", cpu); - } + break; default: ret = -EINVAL; } - unlock_device_hotplug(); - return ret; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6ef00eb6fbb9..d54cffdc7cac 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -46,6 +46,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/fpu/internal.h> +#include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> @@ -242,7 +243,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; - if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; @@ -288,9 +288,29 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overru DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check) #undef IP +dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) +{ + char *str = "alignment check"; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + + if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) + return; + + if (!user_mode(regs)) + die("Split lock detected\n", regs, error_code); + + local_irq_enable(); + + if (handle_user_split_lock(regs, error_code)) + return; + + do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, + error_code, BUS_ADRALN, NULL); +} + #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, @@ -572,14 +592,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Use ist_enter despite the fact that we don't use an IST stack. - * We can be called from a kprobe in non-CONTEXT_KERNEL kernel - * mode or even during context tracking state changes. + * Unlike any other non-IST entry, we can be called from a kprobe in + * non-CONTEXT_KERNEL kernel mode or even during context tracking + * state changes. Make sure that we wake up RCU even if we're coming + * from kernel code. * - * This means that we can't schedule. That's okay. + * This means that we can't schedule even if we came from a + * preemptible kernel context. That's okay. */ - ist_enter(regs); + if (!user_mode(regs)) { + rcu_nmi_enter(); + preempt_disable(); + } RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -600,7 +626,10 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) cond_local_irq_disable(regs); exit: - ist_exit(regs); + if (!user_mode(regs)) { + preempt_enable_no_resched(); + rcu_nmi_exit(); + } } NOKPROBE_SYMBOL(do_int3); @@ -862,7 +891,25 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - cond_local_irq_enable(regs); + /* + * This addresses a Pentium Pro Erratum: + * + * PROBLEM: If the APIC subsystem is configured in mixed mode with + * Virtual Wire mode implemented through the local APIC, an + * interrupt vector of 0Fh (Intel reserved encoding) may be + * generated by the local APIC (Int 15). This vector may be + * generated upon receipt of a spurious interrupt (an interrupt + * which is removed before the system receives the INTA sequence) + * instead of the programmed 8259 spurious interrupt vector. + * + * IMPLICATION: The spurious interrupt vector programmed in the + * 8259 is normally handled by an operating system's spurious + * interrupt handler. However, a vector of 0Fh is unknown to some + * operating systems, which would crash if this erratum occurred. + * + * In theory this could be limited to 32bit, but the handler is not + * hurting and who knows which other CPUs suffer from this. + */ } dotraplinkage void diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7e322e2daaf5..fdd4c1078632 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -477,7 +477,7 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty - * good value for the TSC frequencty. + * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { @@ -1108,17 +1108,24 @@ static void tsc_cs_tick_stable(struct clocksource *cs) sched_clock_tick_stable(); } +static int tsc_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_TSC); + return 0; +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { - .name = "tsc-early", - .rating = 299, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | + .name = "tsc-early", + .rating = 299, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, - .archdata = { .vclock_mode = VCLOCK_TSC }, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, @@ -1131,14 +1138,15 @@ static struct clocksource clocksource_tsc_early = { * been found good. */ static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY, - .archdata = { .vclock_mode = VCLOCK_TSC }, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index e0cbe4f2af49..4fec6f3a1858 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -15,18 +15,46 @@ #include <asm/param.h> #include <asm/tsc.h> -#define MAX_NUM_FREQS 9 +#define MAX_NUM_FREQS 16 /* 4 bits to select the frequency */ + +/* + * The frequency numbers in the SDM are e.g. 83.3 MHz, which does not contain a + * lot of accuracy which leads to clock drift. As far as we know Bay Trail SoCs + * use a 25 MHz crystal and Cherry Trail uses a 19.2 MHz crystal, the crystal + * is the source clk for a root PLL which outputs 1600 and 100 MHz. It is + * unclear if the root PLL outputs are used directly by the CPU clock PLL or + * if there is another PLL in between. + * This does not matter though, we can model the chain of PLLs as a single PLL + * with a quotient equal to the quotients of all PLLs in the chain multiplied. + * So we can create a simplified model of the CPU clock setup using a reference + * clock of 100 MHz plus a quotient which gets us as close to the frequency + * from the SDM as possible. + * For the 83.3 MHz example from above this would give us 100 MHz * 5 / 6 = + * 83 and 1/3 MHz, which matches exactly what has been measured on actual hw. + */ +#define TSC_REFERENCE_KHZ 100000 + +struct muldiv { + u32 multiplier; + u32 divider; +}; /* * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. * Unfortunately some Intel Atom SoCs aren't quite compliant to this, * so we need manually differentiate SoC families. This is what the - * field msr_plat does. + * field use_msr_plat does. */ struct freq_desc { - u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ + bool use_msr_plat; + struct muldiv muldiv[MAX_NUM_FREQS]; + /* + * Some CPU frequencies in the SDM do not map to known PLL freqs, in + * that case the muldiv array is empty and the freqs array is used. + */ u32 freqs[MAX_NUM_FREQS]; + u32 mask; }; /* @@ -35,41 +63,91 @@ struct freq_desc { * by MSR based on SDM. */ static const struct freq_desc freq_desc_pnw = { - 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } + .use_msr_plat = false, + .freqs = { 0, 0, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, }; static const struct freq_desc freq_desc_clv = { - 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } + .use_msr_plat = false, + .freqs = { 0, 133200, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, }; +/* + * Bay Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 000: 100 * 5 / 6 = 83.3333 MHz + * 001: 100 * 1 / 1 = 100.0000 MHz + * 010: 100 * 4 / 3 = 133.3333 MHz + * 011: 100 * 7 / 6 = 116.6667 MHz + * 100: 100 * 4 / 5 = 80.0000 MHz + */ static const struct freq_desc freq_desc_byt = { - 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 } }, + .mask = 0x07, }; +/* + * Cherry Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 7 / 6 = 116.6667 MHz + * 0100: 100 * 4 / 5 = 80.0000 MHz + * 0101: 100 * 14 / 15 = 93.3333 MHz + * 0110: 100 * 9 / 10 = 90.0000 MHz + * 0111: 100 * 8 / 9 = 88.8889 MHz + * 1000: 100 * 7 / 8 = 87.5000 MHz + */ static const struct freq_desc freq_desc_cht = { - 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 }, { 14, 15 }, { 9, 10 }, { 8, 9 }, + { 7, 8 } }, + .mask = 0x0f, }; +/* + * Merriefield SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + */ static const struct freq_desc freq_desc_tng = { - 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 0, 0 }, { 1, 1 }, { 4, 3 } }, + .mask = 0x07, }; +/* + * Moorefield SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 1 / 1 = 100.0000 MHz + */ static const struct freq_desc freq_desc_ann = { - 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 1, 1 } }, + .mask = 0x0f, }; +/* 24 MHz crystal? : 24 * 13 / 4 = 78 MHz */ static const struct freq_desc freq_desc_lgm = { - 1, { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 } + .use_msr_plat = true, + .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 }, + .mask = 0x0f, }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), - INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), - INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt), - INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), - INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), - INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), - INTEL_CPU_FAM6(ATOM_AIRMONT_NP, freq_desc_lgm), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; @@ -81,17 +159,19 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq; + u32 lo, hi, ratio, freq, tscref; const struct freq_desc *freq_desc; const struct x86_cpu_id *id; + const struct muldiv *md; unsigned long res; + int index; id = x86_match_cpu(tsc_msr_cpu_ids); if (!id) return 0; freq_desc = (struct freq_desc *)id->driver_data; - if (freq_desc->msr_plat) { + if (freq_desc->use_msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -101,12 +181,28 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); + index = lo & freq_desc->mask; + md = &freq_desc->muldiv[index]; - /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ - freq = freq_desc->freqs[lo & 0x7]; + /* + * Note this also catches cases where the index points to an unpopulated + * part of muldiv, in that case the else will set freq and res to 0. + */ + if (md->divider) { + tscref = TSC_REFERENCE_KHZ * md->multiplier; + freq = DIV_ROUND_CLOSEST(tscref, md->divider); + /* + * Multiplying by ratio before the division has better + * accuracy than just calculating freq * ratio. + */ + res = DIV_ROUND_CLOSEST(tscref * ratio, md->divider); + } else { + freq = freq_desc->freqs[index]; + res = freq * ratio; + } - /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ - res = freq * ratio; + if (freq == 0) + pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_period = (freq * 1000) / HZ; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 32a818764e03..3d3c761eb74a 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -295,7 +295,7 @@ static cycles_t check_tsc_warp(unsigned int timeout) * But as the TSC is per-logical CPU and can potentially be modified wrongly * by the bios, TSC sync test for smaller duration should be able * to catch such errors. Also this will catch the condition where all the - * cores in the socket doesn't get reset at the same time. + * cores in the socket don't get reset at the same time. */ static inline unsigned int loop_timeout(int cpu) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 91d55454e702..47a8676c7395 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -98,7 +98,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; - long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -114,37 +113,30 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; - if (!access_ok(user, vm86->vm86plus.is_vm86pus ? + if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ? sizeof(struct vm86plus_struct) : - sizeof(struct vm86_struct))) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } - - put_user_try { - put_user_ex(regs->pt.bx, &user->regs.ebx); - put_user_ex(regs->pt.cx, &user->regs.ecx); - put_user_ex(regs->pt.dx, &user->regs.edx); - put_user_ex(regs->pt.si, &user->regs.esi); - put_user_ex(regs->pt.di, &user->regs.edi); - put_user_ex(regs->pt.bp, &user->regs.ebp); - put_user_ex(regs->pt.ax, &user->regs.eax); - put_user_ex(regs->pt.ip, &user->regs.eip); - put_user_ex(regs->pt.cs, &user->regs.cs); - put_user_ex(regs->pt.flags, &user->regs.eflags); - put_user_ex(regs->pt.sp, &user->regs.esp); - put_user_ex(regs->pt.ss, &user->regs.ss); - put_user_ex(regs->es, &user->regs.es); - put_user_ex(regs->ds, &user->regs.ds); - put_user_ex(regs->fs, &user->regs.fs); - put_user_ex(regs->gs, &user->regs.gs); - - put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); - } put_user_catch(err); - if (err) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } + sizeof(struct vm86_struct))) + goto Efault; + + unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end); + unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end); + unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end); + unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end); + unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end); + unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end); + unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end); + unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end); + unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end); + unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end); + unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end); + unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end); + unsafe_put_user(regs->es, &user->regs.es, Efault_end); + unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); + unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); + unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); + unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + user_access_end(); preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; @@ -159,6 +151,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) lazy_load_gs(vm86->regs32.gs); regs->pt.ax = retval; + return; + +Efault_end: + user_access_end(); +Efault: + pr_alert("could not access userspace vm86 info\n"); + do_exit(SIGSEGV); } static void mark_screen_rdonly(struct mm_struct *mm) @@ -243,6 +242,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) struct kernel_vm86_regs vm86regs; struct pt_regs *regs = current_pt_regs(); unsigned long err = 0; + struct vm86_struct v; err = security_mmap_addr(0); if (err) { @@ -278,39 +278,32 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) if (vm86->saved_sp0) return -EPERM; - if (!access_ok(user_vm86, plus ? - sizeof(struct vm86_struct) : - sizeof(struct vm86plus_struct))) + if (copy_from_user(&v, user_vm86, + offsetof(struct vm86_struct, int_revectored))) return -EFAULT; memset(&vm86regs, 0, sizeof(vm86regs)); - get_user_try { - unsigned short seg; - get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); - get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); - get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); - get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); - get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); - get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); - get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); - get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); - get_user_ex(seg, &user_vm86->regs.cs); - vm86regs.pt.cs = seg; - get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); - get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); - get_user_ex(seg, &user_vm86->regs.ss); - vm86regs.pt.ss = seg; - get_user_ex(vm86regs.es, &user_vm86->regs.es); - get_user_ex(vm86regs.ds, &user_vm86->regs.ds); - get_user_ex(vm86regs.fs, &user_vm86->regs.fs); - get_user_ex(vm86regs.gs, &user_vm86->regs.gs); - - get_user_ex(vm86->flags, &user_vm86->flags); - get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); - get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); - } get_user_catch(err); - if (err) - return err; + + vm86regs.pt.bx = v.regs.ebx; + vm86regs.pt.cx = v.regs.ecx; + vm86regs.pt.dx = v.regs.edx; + vm86regs.pt.si = v.regs.esi; + vm86regs.pt.di = v.regs.edi; + vm86regs.pt.bp = v.regs.ebp; + vm86regs.pt.ax = v.regs.eax; + vm86regs.pt.ip = v.regs.eip; + vm86regs.pt.cs = v.regs.cs; + vm86regs.pt.flags = v.regs.eflags; + vm86regs.pt.sp = v.regs.esp; + vm86regs.pt.ss = v.regs.ss; + vm86regs.es = v.regs.es; + vm86regs.ds = v.regs.ds; + vm86regs.fs = v.regs.fs; + vm86regs.gs = v.regs.gs; + + vm86->flags = v.flags; + vm86->screen_bitmap = v.screen_bitmap; + vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, &user_vm86->int_revectored, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e3296aa028fe..1bf7e312361f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -21,6 +21,7 @@ #define LOAD_OFFSET __START_KERNEL_map #endif +#define RUNTIME_DISCARD_EXIT #define EMITS_PT_NOTE #define RO_EXCEPTION_TABLE_ALIGN 16 @@ -313,8 +314,8 @@ SECTIONS . = ALIGN(8); /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame + * .exit.text is discarded at runtime, not link time, to deal with + * references from .altinstructions */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT @@ -412,9 +413,6 @@ SECTIONS DWARF_DEBUG DISCARDS - /DISCARD/ : { - *(.eh_frame) - } } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 991019d5eee1..9fea0757db92 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -59,6 +59,19 @@ config KVM If unsure, say N. +config KVM_WERROR + bool "Compile KVM with -Werror" + # KASAN may cause the build to fail due to larger frames + default y if X86_64 && !KASAN + # We use the dependency on !COMPILE_TEST to not be enabled + # blindly in allmodconfig or allyesconfig configurations + depends on (X86_64 && !KASAN) || !COMPILE_TEST + depends on EXPERT + help + Add -Werror to the build flags for KVM. + + If in doubt, say "N". + config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b19ef421084d..e553f0fdd87d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 ccflags-y += -Iarch/x86/kvm +ccflags-$(CONFIG_KVM_WERROR) += -Werror KVM := ../../../virt/kvm diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ddbc61984227..bc00642e5d3b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -191,25 +191,6 @@ #define NR_FASTOP (ilog2(sizeof(ulong)) + 1) #define FASTOP_SIZE 8 -/* - * fastop functions have a special calling convention: - * - * dst: rax (in/out) - * src: rdx (in/out) - * src2: rcx (in) - * flags: rflags (in/out) - * ex: rsi (in:fastop pointer, out:zero if exception) - * - * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for - * different operand sizes can be reached by calculation, rather than a jump - * table (which would be bigger than the code). - * - * fastop functions are declared as taking a never-defined fastop parameter, - * so they can't be called from C directly. - */ - -struct fastop; - struct opcode { u64 flags : 56; u64 intercept : 8; @@ -311,8 +292,19 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) #define ON64(x) #endif -typedef void (*fastop_t)(struct fastop *); - +/* + * fastop functions have a special calling convention: + * + * dst: rax (in/out) + * src: rdx (in/out) + * src2: rcx (in) + * flags: rflags (in/out) + * ex: rsi (in:fastop pointer, out:zero if exception) + * + * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for + * different operand sizes can be reached by calculation, rather than a jump + * table (which would be bigger than the code). + */ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); #define __FOP_FUNC(name) \ @@ -5181,6 +5173,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->fetch.ptr = ctxt->fetch.data; ctxt->fetch.end = ctxt->fetch.data + insn_len; ctxt->opcode_len = 1; + ctxt->intercept = x86_intercept_none; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); else { @@ -5683,7 +5676,7 @@ special_insn: if (ctxt->execute) { if (ctxt->d & Fastop) - rc = fastop(ctxt, (fastop_t)ctxt->execute); + rc = fastop(ctxt, ctxt->fop); else rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 7668fed1ce65..750ff0b29404 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -378,12 +378,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.delivery_mode == APIC_DM_FIXED) { struct kvm_lapic_irq irq; - irq.shorthand = APIC_DEST_NOSHORT; irq.vector = e->fields.vector; irq.delivery_mode = e->fields.delivery_mode << 8; - irq.dest_id = e->fields.dest_id; irq.dest_mode = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); + irq.level = false; + irq.trig_mode = e->fields.trig_mode; + irq.shorthand = APIC_DEST_NOSHORT; + irq.dest_id = e->fields.dest_id; + irq.msi_redir_hint = false; bitmap_zero(&vcpu_bitmap, 16); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, &vcpu_bitmap); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 79afa0bb5f41..c47d2acec529 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -417,7 +417,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.level && + if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, irq.dest_id, irq.dest_mode)) __set_bit(irq.vector, ioapic_handled_vectors); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index eafc631d305c..7356a56e6282 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -627,9 +627,11 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) { u8 val; - if (pv_eoi_get_user(vcpu, &val) < 0) + if (pv_eoi_get_user(vcpu, &val) < 0) { printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return false; + } return val & 0x1; } @@ -1046,11 +1048,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - if (vcpu->arch.apicv_active) - kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - else { + if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) { kvm_lapic_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } @@ -1080,9 +1079,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; /* assumes that there are only KVM_APIC_INIT/SIPI */ apic->pending_events = (1UL << KVM_APIC_INIT); - /* make sure pending_events is visible before sending - * the request */ - smp_wmb(); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } @@ -1449,6 +1445,8 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) } } +static void cancel_hv_timer(struct kvm_lapic *apic); + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1458,6 +1456,10 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { hrtimer_cancel(&apic->lapic_timer.timer); + preempt_disable(); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + preempt_enable(); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; @@ -1719,7 +1721,7 @@ static void start_sw_period(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, apic->lapic_timer.target_expiration, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_HARD); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index d55674f44a18..a647601c9e1c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -102,6 +102,19 @@ static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) kvm_get_active_pcid(vcpu)); } +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault); + +static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 err, bool prefault) +{ +#ifdef CONFIG_RETPOLINE + if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault)) + return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault); +#endif + return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault); +} + /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7011a4e54866..87e9ba27ada1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4219,8 +4219,8 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault) +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault) { int max_level; @@ -4925,7 +4925,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) return; context->mmu_role.as_u64 = new_role.as_u64; - context->page_fault = tdp_page_fault; + context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -5436,9 +5436,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, } if (r == RET_PF_INVALID) { - r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), - false); + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, + lower_32_bits(error_code), false); WARN_ON(r == RET_PF_INVALID); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4e1ef0473663..21a3320f166a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -33,7 +33,7 @@ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 - #define PT_MAX_FULL_LEVELS 4 + #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #define CMPXCHG cmpxchg #else #define CMPXCHG cmpxchg64 @@ -400,7 +400,7 @@ retry_walk: goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) + if (unlikely(__get_user(pte, ptep_user))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3c6522b84ff1..ffcd96fc02d0 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -339,7 +339,7 @@ TRACE_EVENT( /* These depend on page entry type, so compute them now. */ __field(bool, r) __field(bool, x) - __field(u8, u) + __field(signed char, u) ), TP_fast_assign( diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a3e32d61d60c..216364cb65a3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -48,6 +48,7 @@ #include <asm/kvm_para.h> #include <asm/irq_remapping.h> #include <asm/spec-ctrl.h> +#include <asm/cpu_device_id.h> #include <asm/virtext.h> #include "trace.h" @@ -57,11 +58,13 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +#ifdef MODULE static const struct x86_cpu_id svm_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_SVM), + X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); +#endif #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 @@ -1005,33 +1008,32 @@ static void svm_cpu_uninit(int cpu) static int svm_cpu_init(int cpu) { struct svm_cpu_data *sd; - int r; sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); if (!sd) return -ENOMEM; sd->cpu = cpu; - r = -ENOMEM; sd->save_area = alloc_page(GFP_KERNEL); if (!sd->save_area) - goto err_1; + goto free_cpu_data; if (svm_sev_enabled()) { - r = -ENOMEM; sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1, sizeof(void *), GFP_KERNEL); if (!sd->sev_vmcbs) - goto err_1; + goto free_save_area; } per_cpu(svm_data, cpu) = sd; return 0; -err_1: +free_save_area: + __free_page(sd->save_area); +free_cpu_data: kfree(sd); - return r; + return -ENOMEM; } @@ -1350,6 +1352,24 @@ static __init void svm_adjust_mmio_mask(void) kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); } +static void svm_hardware_teardown(void) +{ + int cpu; + + if (svm_sev_enabled()) { + bitmap_free(sev_asid_bitmap); + bitmap_free(sev_reclaim_asid_bitmap); + + sev_flush_asids(); + } + + for_each_possible_cpu(cpu) + svm_cpu_uninit(cpu); + + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + iopm_base = 0; +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1463,29 +1483,10 @@ static __init int svm_hardware_setup(void) return 0; err: - __free_pages(iopm_pages, IOPM_ALLOC_ORDER); - iopm_base = 0; + svm_hardware_teardown(); return r; } -static __exit void svm_hardware_unsetup(void) -{ - int cpu; - - if (svm_sev_enabled()) { - bitmap_free(sev_asid_bitmap); - bitmap_free(sev_reclaim_asid_bitmap); - - sev_flush_asids(); - } - - for_each_possible_cpu(cpu) - svm_cpu_uninit(cpu); - - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); - iopm_base = 0; -} - static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; @@ -1933,14 +1934,6 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) static void __unregister_enc_region_locked(struct kvm *kvm, struct enc_region *region) { - /* - * The guest may change the memory encryption attribute from C=0 -> C=1 - * or vice versa for this memory range. Lets make sure caches are - * flushed to ensure that guest data gets written into memory with - * correct C-bit. - */ - sev_clflush_pages(region->pages, region->npages); - sev_unpin_memory(kvm, region->pages, region->npages); list_del(®ion->list); kfree(region); @@ -1971,6 +1964,13 @@ static void sev_vm_destroy(struct kvm *kvm) mutex_lock(&kvm->lock); /* + * Ensure that all guest tagged cache entries are flushed before + * releasing the pages back to the system for use. CLFLUSH will + * not do this, so issue a WBINVD. + */ + wbinvd_on_all_cpus(); + + /* * if userspace was terminated before unregistering the memory regions * then lets unpin all the registered memory. */ @@ -2175,7 +2175,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u32 dummy; u32 eax = 1; - vcpu->arch.microcode_version = 0x01000065; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; @@ -2197,8 +2196,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static int avic_init_vcpu(struct vcpu_svm *svm) { int ret; + struct kvm_vcpu *vcpu = &svm->vcpu; - if (!kvm_vcpu_apicv_active(&svm->vcpu)) + if (!avic || !irqchip_in_kernel(vcpu->kvm)) return 0; ret = avic_init_backing_page(&svm->vcpu); @@ -2266,6 +2266,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) init_vmcb(svm); svm_init_osvw(vcpu); + vcpu->arch.microcode_version = 0x01000065; return 0; @@ -5232,6 +5233,9 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; bool activated = kvm_vcpu_apicv_active(vcpu); + if (!avic) + return; + if (activated) { /** * During AVIC temporary deactivation, guest could update @@ -5255,8 +5259,11 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) +static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) { + if (!vcpu->arch.apicv_active) + return -1; + kvm_lapic_set_irr(vec, vcpu->arch.apic); smp_mb__after_atomic(); @@ -5268,6 +5275,8 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) put_cpu(); } else kvm_vcpu_wake_up(vcpu); + + return 0; } static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) @@ -6303,7 +6312,8 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, enum exit_fastpath_completion *exit_fastpath) { if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == EXIT_REASON_MSR_WRITE) + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } @@ -7148,6 +7158,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) if (!svm_sev_enabled()) return -ENOTTY; + if (!argp) + return 0; + if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) return -EFAULT; @@ -7275,6 +7288,13 @@ static int svm_unregister_enc_region(struct kvm *kvm, goto failed; } + /* + * Ensure that all guest tagged cache entries are flushed before + * releasing the pages back to the system for use. CLFLUSH will + * not do this, so issue a WBINVD. + */ + wbinvd_on_all_cpus(); + __unregister_enc_region_locked(kvm, region); mutex_unlock(&kvm->lock); @@ -7378,7 +7398,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, - .hardware_unsetup = svm_hardware_unsetup, + .hardware_unsetup = svm_hardware_teardown, .check_processor_compatibility = svm_check_processor_compat, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, @@ -7433,6 +7453,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .update_emulated_instruction = NULL, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index f194dd058470..cef5a344fedb 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -815,8 +815,8 @@ TRACE_EVENT(kvm_write_tsc_offset, #ifdef CONFIG_X86_64 #define host_clocks \ - {VCLOCK_NONE, "none"}, \ - {VCLOCK_TSC, "tsc"} \ + {VDSO_CLOCKMODE_NONE, "none"}, \ + {VDSO_CLOCKMODE_TSC, "tsc"} \ TRACE_EVENT(kvm_update_master_clock, TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 283bdb7071af..f486e2606247 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -12,6 +12,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_apicv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 657c2eda357c..9750e590c89d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -224,7 +224,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) return; kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_vmptr = 0; vmx->nested.hv_evmcs = NULL; } @@ -544,7 +544,8 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, } } -static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { +static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) +{ int msr; for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { @@ -1922,7 +1923,8 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) return 1; - if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { + if (unlikely(!vmx->nested.hv_evmcs || + evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { if (!vmx->nested.hv_evmcs) vmx->nested.current_vmptr = -1ull; @@ -1981,7 +1983,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } /* - * Clean fields data can't de used on VMLAUNCH and when we switch + * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1. */ if (from_launch || evmcs_gpa_changed) @@ -3160,10 +3162,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. * * Returns: - * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode - * NVMX_ENTRY_VMFAIL: Consistency check VMFail - * NVMX_ENTRY_VMEXIT: Consistency check VMExit - * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error + * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_VMENTRY_VMFAIL: Consistency check VMFail + * NVMX_VMENTRY_VMEXIT: Consistency check VMExit + * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error */ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) @@ -3575,25 +3577,80 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); } +/* + * Returns true if a debug trap is pending delivery. + * + * In KVM, debug traps bear an exception payload. As such, the class of a #DB + * exception may be inferred from the presence of an exception payload. + */ +static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending && + vcpu->arch.exception.nr == DB_VECTOR && + vcpu->arch.exception.payload; +} + +/* + * Certain VM-exits set the 'pending debug exceptions' field to indicate a + * recognized #DB (data or single-step) that has yet to be delivered. Since KVM + * represents these debug traps with a payload that is said to be compatible + * with the 'pending debug exceptions' field, write the payload to the VMCS + * field if a VM-exit is delivered before the debug trap. + */ +static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) +{ + if (vmx_pending_dbg_trap(vcpu)) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vcpu->arch.exception.payload); +} + static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qual; bool block_nested_events = vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); + bool mtf_pending = vmx->nested.mtf_pending; struct kvm_lapic *apic = vcpu->arch.apic; + /* + * Clear the MTF state. If a higher priority VM-exit is delivered first, + * this state is discarded. + */ + vmx->nested.mtf_pending = false; + if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; + nested_vmx_update_pending_dbg(vcpu); clear_bit(KVM_APIC_INIT, &apic->pending_events); nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); return 0; } + /* + * Process any exceptions that are not debug traps before MTF. + */ + if (vcpu->arch.exception.pending && + !vmx_pending_dbg_trap(vcpu) && + nested_vmx_check_exception(vcpu, &exit_qual)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + return 0; + } + + if (mtf_pending) { + if (block_nested_events) + return -EBUSY; + nested_vmx_update_pending_dbg(vcpu); + nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); + return 0; + } + if (vcpu->arch.exception.pending && - nested_vmx_check_exception(vcpu, &exit_qual)) { + nested_vmx_check_exception(vcpu, &exit_qual)) { if (block_nested_events) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); @@ -5256,24 +5313,17 @@ fail: return 1; } - -static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +/* + * Return true if an IO instruction with the specified port and size should cause + * a VM-exit into L1. + */ +bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, + int size) { - unsigned long exit_qualification; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gpa_t bitmap, last_bitmap; - unsigned int port; - int size; u8 b; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; - last_bitmap = (gpa_t)-1; b = -1; @@ -5300,8 +5350,26 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + unsigned short port; + int size; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + return nested_vmx_check_io_bitmaps(vcpu, port, size); +} + /* - * Return 1 if we should exit from L2 to L1 to handle an MSR access access, + * Return 1 if we should exit from L2 to L1 to handle an MSR access, * rather than handle it ourselves in L0. I.e., check whether L1 expressed * disinterest in the current event (read or write a specific MSR) by using an * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. @@ -5683,6 +5751,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx->nested.nested_run_pending) kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + + if (vmx->nested.mtf_pending) + kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; } } @@ -5863,6 +5934,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + vmx->nested.mtf_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); + ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { @@ -5920,8 +5994,7 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ -void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, - bool apicv) +void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) { /* * Note that as a general rule, the high half of the MSRs (bits in @@ -5948,7 +6021,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | - (apicv ? PIN_BASED_POSTED_INTR : 0); + (enable_apicv ? PIN_BASED_POSTED_INTR : 0); msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index fc874d4ead0f..9aeda46f473e 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -17,8 +17,7 @@ enum nvmx_vmentry_status { }; void vmx_leave_nested(struct kvm_vcpu *vcpu); -void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, - bool apicv); +void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_set_vmcs_shadowing_bitmap(void); @@ -34,6 +33,8 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); +bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, + int size); static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { @@ -175,6 +176,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12) +{ + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9a6664886f2e..458e684dfbdc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -31,6 +31,7 @@ #include <asm/apic.h> #include <asm/asm.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/fpu/internal.h> @@ -41,6 +42,7 @@ #include <asm/mce.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> +#include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/virtext.h> #include <asm/vmx.h> @@ -64,11 +66,13 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +#ifdef MODULE static const struct x86_cpu_id vmx_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_VMX), + X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); +#endif bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); @@ -95,7 +99,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -static bool __read_mostly enable_apicv = 1; +bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); /* @@ -1175,6 +1179,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->guest_msrs[i].mask); } + + if (vmx->nested.need_vmcs12_to_shadow_sync) + nested_sync_vmcs12_to_shadow(vcpu); + if (vmx->guest_state_loaded) return; @@ -1599,6 +1607,40 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 1; } + +/* + * Recognizes a pending MTF VM-exit and records the nested state for later + * delivery. + */ +static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!is_guest_mode(vcpu)) + return; + + /* + * Per the SDM, MTF takes priority over debug-trap exceptions besides + * T-bit traps. As instruction emulation is completed (i.e. at the + * instruction boundary), any #DB exception pending delivery must be a + * debug-trap. Record the pending MTF state to be delivered in + * vmx_check_nested_events(). + */ + if (nested_cpu_has_mtf(vmcs12) && + (!vcpu->arch.exception.pending || + vcpu->arch.exception.nr == DB_VECTOR)) + vmx->nested.mtf_pending = true; + else + vmx->nested.mtf_pending = false; +} + +static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + vmx_update_emulated_instruction(vcpu); + return skip_emulated_instruction(vcpu); +} + static void vmx_clear_hlt(struct kvm_vcpu *vcpu) { /* @@ -2298,6 +2340,17 @@ static void hardware_disable(void) kvm_cpu_vmxoff(); } +/* + * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID + * directly instead of going through cpu_has(), to ensure KVM is trapping + * ENCLS whenever it's supported in hardware. It does not matter whether + * the host OS supports or has enabled SGX. + */ +static bool cpu_has_sgx(void) +{ + return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); +} + static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { @@ -2378,8 +2431,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | - SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_ENCLS_EXITING; + SECONDARY_EXEC_ENABLE_VMFUNC; + if (cpu_has_sgx()) + opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2947,6 +3001,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int get_ept_level(struct kvm_vcpu *vcpu) { + /* Nested EPT currently only supports 4-level walks. */ + if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) + return 4; if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; @@ -3815,24 +3872,29 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, * 2. If target vcpu isn't running(root mode), kick it to pick up the * interrupt from PIR in next vmentry. */ -static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) +static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { struct vcpu_vmx *vmx = to_vmx(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); if (!r) - return; + return 0; + + if (!vcpu->arch.apicv_active) + return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return; + return 0; /* If a previous notification has sent the IPI, nothing to do. */ if (pi_test_and_set_on(&vmx->pi_desc)) - return; + return 0; if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); + + return 0; } /* @@ -4238,7 +4300,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->msr_ia32_umwait_control = 0; - vcpu->arch.microcode_version = 0x100000000ULL; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); @@ -6228,7 +6289,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) #endif ASM_CALL_CONSTRAINT : - THUNK_TARGET(entry), + [thunk_target]"r"(entry), [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); @@ -6480,8 +6541,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_write32(PLE_WINDOW, vmx->ple_window); } - if (vmx->nested.need_vmcs12_to_shadow_sync) - nested_sync_vmcs12_to_shadow(vcpu); + /* + * We did this in prepare_switch_to_guest, because it needs to + * be within srcu_read_lock. + */ + WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); @@ -6755,14 +6819,14 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (nested) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, - vmx_capability.ept, - kvm_vcpu_apicv_active(vcpu)); + vmx_capability.ept); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; + vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; /* @@ -6836,8 +6900,7 @@ static int __init vmx_check_processor_compat(void) if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) return -EIO; if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, - enable_apicv); + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); @@ -7098,6 +7161,40 @@ static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) to_vmx(vcpu)->req_immediate_exit = true; } +static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned short port; + bool intercept; + int size; + + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + port = info->src_val; + size = info->dst_bytes; + } else { + port = info->dst_val; + size = info->src_bytes; + } + + /* + * If the 'use IO bitmaps' VM-execution control is 0, IO instruction + * VM-exits depend on the 'unconditional IO exiting' VM-execution + * control. + * + * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. + */ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + intercept = nested_cpu_has(vmcs12, + CPU_BASED_UNCOND_IO_EXITING); + else + intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); + + /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; +} + static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) @@ -7105,19 +7202,45 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + switch (info->intercept) { /* * RDPID causes #UD if disabled through secondary execution controls. * Because it is marked as EmulateOnUD, we need to intercept it here. */ - if (info->intercept == x86_intercept_rdtscp && - !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { - ctxt->exception.vector = UD_VECTOR; - ctxt->exception.error_code_valid = false; - return X86EMUL_PROPAGATE_FAULT; - } + case x86_intercept_rdtscp: + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + ctxt->exception.vector = UD_VECTOR; + ctxt->exception.error_code_valid = false; + return X86EMUL_PROPAGATE_FAULT; + } + break; + + case x86_intercept_in: + case x86_intercept_ins: + case x86_intercept_out: + case x86_intercept_outs: + return vmx_check_intercept_io(vcpu, info); + + case x86_intercept_lgdt: + case x86_intercept_lidt: + case x86_intercept_lldt: + case x86_intercept_ltr: + case x86_intercept_sgdt: + case x86_intercept_sidt: + case x86_intercept_sldt: + case x86_intercept_str: + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) + return X86EMUL_CONTINUE; + + /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + break; /* TODO: check more intercepts... */ - return X86EMUL_CONTINUE; + default: + break; + } + + return X86EMUL_UNHANDLEABLE; } #ifdef CONFIG_X86_64 @@ -7699,7 +7822,7 @@ static __init int hardware_setup(void) if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, - vmx_capability.ept, enable_apicv); + vmx_capability.ept); r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) @@ -7783,7 +7906,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, + .skip_emulated_instruction = vmx_skip_emulated_instruction, + .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 7f42cf3dcd70..0695ea177e22 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -14,8 +14,6 @@ extern const u32 vmx_msr_index[]; extern u64 host_efer; -extern u32 get_umwait_control_msr(void); - #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 #define MSR_TYPE_RW 3 @@ -150,6 +148,9 @@ struct nested_vmx { /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; + /* Pending MTF VM-exit into L1. */ + bool mtf_pending; + struct loaded_vmcs vmcs02; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fbabb2f06273..bf8564d73fc3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -438,6 +438,14 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * for #DB exceptions under VMX. */ vcpu->arch.dr6 ^= payload & DR6_RTM; + + /* + * The #DB payload is defined as compatible with the 'pending + * debug exceptions' field under VMX, not DR6. While bit 12 is + * defined in the 'pending debug exceptions' field (enabled + * breakpoint), it is reserved and must be zero in DR6. + */ + vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: vcpu->arch.cr2 = payload; @@ -490,19 +498,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; - /* - * In guest mode, payload delivery should be deferred, - * so that the L1 hypervisor can intercept #PF before - * CR2 is modified (or intercept #DB before DR6 is - * modified under nVMX). However, for ABI - * compatibility with KVM_GET_VCPU_EVENTS and - * KVM_SET_VCPU_EVENTS, we can't delay payload - * delivery unless userspace has enabled this - * functionality via the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD. - */ - if (!vcpu->kvm->arch.exception_payload_enabled || - !is_guest_mode(vcpu)) + if (!is_guest_mode(vcpu)) kvm_deliver_exception_payload(vcpu); return; } @@ -1558,7 +1554,10 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); */ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) { - if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) && + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) + return 1; + + if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { @@ -1635,7 +1634,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata->clock.cycle_last = tk->tkr_mono.cycle_last; vdata->clock.mask = tk->tkr_mono.mask; vdata->clock.mult = tk->tkr_mono.mult; @@ -1643,7 +1642,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; vdata->clock.offset = tk->tkr_mono.base; - vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; vdata->raw_clock.mask = tk->tkr_raw.mask; vdata->raw_clock.mult = tk->tkr_raw.mult; @@ -1844,7 +1843,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) static inline int gtod_is_based_on_tsc(int mode) { - return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK; + return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) @@ -1937,7 +1936,7 @@ static inline bool kvm_check_tsc_unstable(void) * TSC is marked unstable when we're running on Hyper-V, * 'TSC page' clocksource is good. */ - if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK) + if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) return false; #endif return check_tsc_unstable(); @@ -2092,30 +2091,30 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, u64 tsc_pg_val; switch (clock->vclock_mode) { - case VCLOCK_HVCLOCK: + case VDSO_CLOCKMODE_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ - *mode = VCLOCK_HVCLOCK; + *mode = VDSO_CLOCKMODE_HVCLOCK; v = (tsc_pg_val - clock->cycle_last) & clock->mask; } else { /* TSC page invalid */ - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } break; - case VCLOCK_TSC: - *mode = VCLOCK_TSC; + case VDSO_CLOCKMODE_TSC: + *mode = VDSO_CLOCKMODE_TSC; *tsc_timestamp = read_tsc(); v = (*tsc_timestamp - clock->cycle_last) & clock->mask; break; default: - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } - if (*mode == VCLOCK_NONE) + if (*mode == VDSO_CLOCKMODE_NONE) *tsc_timestamp = v = 0; return v * clock->mult; @@ -2448,7 +2447,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - WARN_ON(vcpu->hv_clock.system_time < 0); /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; @@ -3796,6 +3794,21 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, process_nmi(vcpu); /* + * In guest mode, payload delivery should be deferred, + * so that the L1 hypervisor can intercept #PF before + * CR2 is modified (or intercept #DB before DR6 is + * modified under nVMX). Unless the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of + * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we + * opportunistically defer the exception payload, deliver it if the + * capability hasn't been requested before processing a + * KVM_GET_VCPU_EVENTS. + */ + if (!vcpu->kvm->arch.exception_payload_enabled && + vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) + kvm_deliver_exception_payload(vcpu); + + /* * The API doesn't provide the instruction length for software * exceptions, so don't report them. As long as the guest RIP * isn't advanced, we should expect to encounter the exception @@ -6880,6 +6893,8 @@ restart: kvm_rip_write(vcpu, ctxt->eip); if (r && ctxt->tf) r = kvm_vcpu_do_singlestep(vcpu); + if (kvm_x86_ops->update_emulated_instruction) + kvm_x86_ops->update_emulated_instruction(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -7177,14 +7192,16 @@ static void kvm_timer_init(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; + struct cpufreq_policy *policy; int cpu; - memset(&policy, 0, sizeof(policy)); cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; + policy = cpufreq_cpu_get(cpu); + if (policy) { + if (policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } put_cpu(); #endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, @@ -7295,12 +7312,12 @@ int kvm_arch_init(void *opaque) } if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); + pr_err_ratelimited("kvm: no hardware support\n"); r = -EOPNOTSUPP; goto out; } if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); + pr_err_ratelimited("kvm: disabled by bios\n"); r = -EOPNOTSUPP; goto out; } @@ -8942,7 +8959,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -10182,7 +10198,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) return; - vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 53adc1762ec0..ec31f5b60323 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -366,7 +366,7 @@ AVXcode: 1 1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv 1c: Grp20 (1A),(1C) 1d: -1e: +1e: Grp21 (1A) 1f: NOP Ev # 0x0f 0x20-0x2f 20: MOV Rd,Cd @@ -803,8 +803,8 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) f3: Grp17 (1A) -f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) -f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) +f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) f9: MOVDIRI My,Gy @@ -970,7 +970,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: rdpkru (110),(11B) | wrpkru (111),(11B) +5: rdpkru (110),(11B) | wrpkru (111),(11B) | SAVEPREVSSP (F3),(010),(11B) | RSTORSSP Mq (F3) | SETSSBSY (F3),(000),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable @@ -1041,8 +1041,8 @@ GrpTable: Grp15 2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE | ptwrite Ey (F3),(11B) -5: XRSTOR | lfence (11B) -6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B) +5: XRSTOR | lfence (11B) | INCSSPD/Q Ry (F3),(11B) +6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B) | CLRSSBSY Mq (F3) 7: clflush | clflushopt (66) | sfence (11B) EndTable @@ -1077,6 +1077,11 @@ GrpTable: Grp20 0: cldemote Mb EndTable +GrpTable: Grp21 +1: RDSSPD/Q Ry (F3),(11B) +7: ENDBR64 (F3),(010),(11B) | ENDBR32 (F3),(011),(11B) +EndTable + # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 64229dad7eab..69309cd56fdf 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -363,13 +363,8 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, { const struct ptdump_range ptdump_ranges[] = { #ifdef CONFIG_X86_64 - -#define normalize_addr_shift (64 - (__VIRTUAL_MASK_SHIFT + 1)) -#define normalize_addr(u) ((signed long)((u) << normalize_addr_shift) >> \ - normalize_addr_shift) - {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, - {normalize_addr(PTRS_PER_PGD * PGD_LEVEL_MULT / 2), ~0UL}, + {GUARD_HOLE_END_ADDR, ~0UL}, #else {0, ~0UL}, #endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 30bb0bd3b1b8..b991aa4bdfae 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -80,18 +80,6 @@ __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_uaccess); -__visible bool ex_handler_ext(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* Special hack for uaccess_err */ - current->thread.uaccess_err = 1; - regs->ip = ex_fixup_addr(fixup); - return true; -} -EXPORT_SYMBOL(ex_handler_ext); - __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long error_code, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fa4ea09593ab..629fdf13f846 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -190,7 +190,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -void vmalloc_sync_all(void) +static void vmalloc_sync(void) { unsigned long address; @@ -217,6 +217,16 @@ void vmalloc_sync_all(void) } } +void vmalloc_sync_mappings(void) +{ + vmalloc_sync(); +} + +void vmalloc_sync_unmappings(void) +{ + vmalloc_sync(); +} + /* * 32-bit: * @@ -319,11 +329,23 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_all(void) +void vmalloc_sync_mappings(void) { + /* + * 64-bit mappings might allocate new p4d/pud pages + * that need to be propagated to all tasks' PGDs. + */ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } +void vmalloc_sync_unmappings(void) +{ + /* + * Unmappings never allocate or free p4d/pud pages. + * No work is required here. + */ +} + /* * 64-bit: * diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 23df4885bbed..8ae0272c1c51 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -788,44 +788,6 @@ void __init mem_init(void) x86_init.hyper.init_after_bootmem(); mem_init_print_info(NULL); - printk(KERN_INFO "virtual kernel memory layout:\n" - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" - " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" -#ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" -#endif - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", - FIXADDR_START, FIXADDR_TOP, - (FIXADDR_TOP - FIXADDR_START) >> 10, - - CPU_ENTRY_AREA_BASE, - CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, - CPU_ENTRY_AREA_MAP_SIZE >> 10, - -#ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, -#endif - - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, - - (unsigned long)__va(0), (unsigned long)high_memory, - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, - - (unsigned long)&__init_begin, (unsigned long)&__init_end, - ((unsigned long)&__init_end - - (unsigned long)&__init_begin) >> 10, - - (unsigned long)&_etext, (unsigned long)&_edata, - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, - - (unsigned long)&_text, (unsigned long)&_etext, - ((unsigned long)&_etext - (unsigned long)&_text) >> 10); /* * Check boundaries twice: Some fundamental inconsistencies can diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index abbdecb75fad..0a14711d3a93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -818,8 +818,7 @@ void __init paging_init(void) * will not set it back. */ node_clear_state(0, N_MEMORY); - if (N_MEMORY != N_NORMAL_MEMORY) - node_clear_state(0, N_NORMAL_MEMORY); + node_clear_state(0, N_NORMAL_MEMORY); zone_sizes_init(); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 44e4beb4239f..18c637c0dc6f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -106,6 +106,22 @@ static unsigned int __ioremap_check_encrypted(struct resource *res) return 0; } +/* + * The EFI runtime services data area is not covered by walk_mem_res(), but must + * be mapped encrypted when SEV is active. + */ +static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *desc) +{ + if (!sev_active()) + return; + + if (!IS_ENABLED(CONFIG_EFI)) + return; + + if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA) + desc->flags |= IORES_MAP_ENCRYPTED; +} + static int __ioremap_collect_map_flags(struct resource *res, void *arg) { struct ioremap_desc *desc = arg; @@ -124,6 +140,9 @@ static int __ioremap_collect_map_flags(struct resource *res, void *arg) * To avoid multiple resource walks, this function walks resources marked as * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). + * + * After that, deal with misc other ranges in __ioremap_check_other() which do + * not fall into the above category. */ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, struct ioremap_desc *desc) @@ -135,6 +154,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, memset(desc, 0, sizeof(struct ioremap_desc)); walk_mem_res(start, end, desc, __ioremap_collect_map_flags); + + __ioremap_check_other(addr, desc); } /* diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 49d7814b59a9..9994353fb75d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -260,7 +260,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) goto no_kmmio; } - ctx = &get_cpu_var(kmmio_ctx); + ctx = this_cpu_ptr(&kmmio_ctx); if (ctx->active) { if (page_base == ctx->addr) { /* @@ -285,7 +285,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr); disarm_kmmio_fault_page(faultpage); } - goto no_kmmio_ctx; + goto no_kmmio; } ctx->active++; @@ -314,11 +314,8 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * the user should drop to single cpu before tracing. */ - put_cpu_var(kmmio_ctx); return 1; /* fault handled */ -no_kmmio_ctx: - put_cpu_var(kmmio_ctx); no_kmmio: rcu_read_unlock(); preempt_enable_no_resched(); @@ -333,7 +330,7 @@ no_kmmio: static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; - struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx); if (!ctx->active) { /* @@ -371,7 +368,6 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (!(regs->flags & X86_EFLAGS_TF)) ret = 1; out: - put_cpu_var(kmmio_ctx); return ret; } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 673de6063345..109325d77b3e 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -386,7 +386,7 @@ static void enter_uniprocessor(void) put_online_cpus(); for_each_cpu(cpu, downed_cpus) { - err = cpu_down(cpu); + err = remove_cpu(cpu); if (!err) pr_info("CPU%d is down.\n", cpu); else @@ -406,7 +406,7 @@ static void leave_uniprocessor(void) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { - err = cpu_up(cpu); + err = add_cpu(cpu); if (!err) pr_info("enabled CPU%d.\n", cpu); else diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 7f1d2034df1e..c5174b4e318b 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -324,7 +324,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, 0, NULL, NUMA_NO_NODE); } -int __init setup_emu2phys_nid(int *dfl_phys_nid) +static int __init setup_emu2phys_nid(int *dfl_phys_nid) { int i, max_emu_nid = 0; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index c4aedd00c1ba..6d5424069e2b 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -15,6 +15,7 @@ #include <linux/gfp.h> #include <linux/pci.h> #include <linux/vmalloc.h> +#include <linux/libnvdimm.h> #include <asm/e820/api.h> #include <asm/processor.h> @@ -304,11 +305,13 @@ void clflush_cache_range(void *vaddr, unsigned int size) } EXPORT_SYMBOL_GPL(clflush_cache_range); +#ifdef CONFIG_ARCH_HAS_PMEM_API void arch_invalidate_pmem(void *addr, size_t size) { clflush_cache_range(addr, size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); +#endif static void __cpa_flush_all(void *arg) { diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 44a9f068eee0..843aa10a4cb6 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -39,6 +39,7 @@ #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/sections.h> +#include <asm/set_memory.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt @@ -555,13 +556,6 @@ static inline bool pti_kernel_image_global_ok(void) } /* - * This is the only user for these and it is not arch-generic - * like the other set_memory.h functions. Just extern them. - */ -extern int set_memory_nonglobal(unsigned long addr, int numpages); -extern int set_memory_global(unsigned long addr, int numpages); - -/* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 393d251798c0..4d2a7a764602 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2039,10 +2039,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } /* and dreg_lo,sreg_lo */ EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo)); - /* and dreg_hi,sreg_hi */ - EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi)); - /* or dreg_lo,dreg_hi */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi)); + if (is_jmp64) { + /* and dreg_hi,sreg_hi */ + EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi)); + /* or dreg_lo,dreg_hi */ + EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi)); + } goto emit_cond_jmp; } case BPF_JMP | BPF_JSET | BPF_K: diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index ee6b0780bea1..f8ed5f66cd20 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -117,17 +117,16 @@ static void punit_dbgfs_unregister(void) debugfs_remove_recursive(punit_dbg_file); } -#define ICPU(model, drv_data) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\ - (kernel_ulong_t)&drv_data } +#define X86_MATCH(model, data) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ + X86_FEATURE_MWAIT, data) static const struct x86_cpu_id intel_punit_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt), - ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng), - ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), + X86_MATCH(ATOM_SILVERMONT, &punit_device_byt), + X86_MATCH(ATOM_SILVERMONT_MID, &punit_device_tng), + X86_MATCH(ATOM_AIRMONT, &punit_device_cht), {} }; - MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids); static int __init punit_atom_debug_init(void) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ae923ee8e2b4..1aae5302501d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -54,10 +54,16 @@ #include <asm/x86_init.h> #include <asm/uv/uv.h> -static efi_system_table_t efi_systab __initdata; -static u64 efi_systab_phys __initdata; +static unsigned long efi_systab_phys __initdata; +static unsigned long prop_phys = EFI_INVALID_TABLE_ADDR; +static unsigned long uga_phys = EFI_INVALID_TABLE_ADDR; +static unsigned long efi_runtime, efi_nr_tables; -static efi_config_table_type_t arch_tables[] __initdata = { +unsigned long efi_fw_vendor, efi_config_table; + +static const efi_config_table_type_t arch_tables[] __initconst = { + {EFI_PROPERTIES_TABLE_GUID, "PROP", &prop_phys}, + {UGA_IO_PROTOCOL_GUID, "UGA", &uga_phys}, #ifdef CONFIG_X86_UV {UV_SYSTEM_TABLE_GUID, "UVsystab", &uv_systab_phys}, #endif @@ -65,26 +71,26 @@ static efi_config_table_type_t arch_tables[] __initdata = { }; static const unsigned long * const efi_tables[] = { - &efi.mps, &efi.acpi, &efi.acpi20, &efi.smbios, &efi.smbios3, - &efi.boot_info, - &efi.hcdp, - &efi.uga, + &uga_phys, #ifdef CONFIG_X86_UV &uv_systab_phys, #endif - &efi.fw_vendor, - &efi.runtime, - &efi.config_table, + &efi_fw_vendor, + &efi_runtime, + &efi_config_table, &efi.esrt, - &efi.properties_table, - &efi.mem_attr_table, + &prop_phys, + &efi_mem_attr_table, #ifdef CONFIG_EFI_RCI2_TABLE &rci2_table_phys, #endif + &efi.tpm_log, + &efi.tpm_final_log, + &efi_rng_seed, }; u64 efi_setup; /* efi setup_data physical address */ @@ -214,16 +220,13 @@ int __init efi_memblock_x86_reserve_range(void) if (efi_enabled(EFI_PARAVIRT)) return 0; -#ifdef CONFIG_X86_32 - /* Can't handle data above 4GB at this time */ - if (e->efi_memmap_hi) { + /* Can't handle firmware tables above 4GB on i386 */ + if (IS_ENABLED(CONFIG_X86_32) && e->efi_memmap_hi > 0) { pr_err("Memory map is above 4GB, disabling EFI.\n"); return -EINVAL; } - pmap = e->efi_memmap; -#else - pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); -#endif + pmap = (phys_addr_t)(e->efi_memmap | ((u64)e->efi_memmap_hi << 32)); + data.phys_map = pmap; data.size = e->efi_memmap_size; data.desc_size = e->efi_memdesc_size; @@ -243,6 +246,7 @@ int __init efi_memblock_x86_reserve_range(void) efi.memmap.desc_version); memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size); + set_bit(EFI_PRESERVE_BS_REGIONS, &efi.flags); return 0; } @@ -305,11 +309,11 @@ static void __init efi_clean_memmap(void) if (n_removal > 0) { struct efi_memory_map_data data = { - .phys_map = efi.memmap.phys_map, - .desc_version = efi.memmap.desc_version, - .desc_size = efi.memmap.desc_size, - .size = efi.memmap.desc_size * (efi.memmap.nr_map - n_removal), - .flags = 0, + .phys_map = efi.memmap.phys_map, + .desc_version = efi.memmap.desc_version, + .desc_size = efi.memmap.desc_size, + .size = efi.memmap.desc_size * (efi.memmap.nr_map - n_removal), + .flags = 0, }; pr_warn("Removing %d invalid memory map entries.\n", n_removal); @@ -333,43 +337,32 @@ void __init efi_print_memmap(void) } } -static int __init efi_systab_init(u64 phys) +static int __init efi_systab_init(unsigned long phys) { int size = efi_enabled(EFI_64BIT) ? sizeof(efi_system_table_64_t) : sizeof(efi_system_table_32_t); + const efi_table_hdr_t *hdr; bool over4g = false; void *p; + int ret; - p = early_memremap_ro(phys, size); + hdr = p = early_memremap_ro(phys, size); if (p == NULL) { pr_err("Couldn't map the system table!\n"); return -ENOMEM; } + ret = efi_systab_check_header(hdr, 1); + if (ret) { + early_memunmap(p, size); + return ret; + } + if (efi_enabled(EFI_64BIT)) { const efi_system_table_64_t *systab64 = p; - efi_systab.hdr = systab64->hdr; - efi_systab.fw_vendor = systab64->fw_vendor; - efi_systab.fw_revision = systab64->fw_revision; - efi_systab.con_in_handle = systab64->con_in_handle; - efi_systab.con_in = systab64->con_in; - efi_systab.con_out_handle = systab64->con_out_handle; - efi_systab.con_out = (void *)(unsigned long)systab64->con_out; - efi_systab.stderr_handle = systab64->stderr_handle; - efi_systab.stderr = systab64->stderr; - efi_systab.runtime = (void *)(unsigned long)systab64->runtime; - efi_systab.boottime = (void *)(unsigned long)systab64->boottime; - efi_systab.nr_tables = systab64->nr_tables; - efi_systab.tables = systab64->tables; - - over4g = systab64->con_in_handle > U32_MAX || - systab64->con_in > U32_MAX || - systab64->con_out_handle > U32_MAX || - systab64->con_out > U32_MAX || - systab64->stderr_handle > U32_MAX || - systab64->stderr > U32_MAX || - systab64->boottime > U32_MAX; + efi_runtime = systab64->runtime; + over4g = systab64->runtime > U32_MAX; if (efi_setup) { struct efi_setup_data *data; @@ -380,38 +373,33 @@ static int __init efi_systab_init(u64 phys) return -ENOMEM; } - efi_systab.fw_vendor = (unsigned long)data->fw_vendor; - efi_systab.runtime = (void *)(unsigned long)data->runtime; - efi_systab.tables = (unsigned long)data->tables; + efi_fw_vendor = (unsigned long)data->fw_vendor; + efi_config_table = (unsigned long)data->tables; over4g |= data->fw_vendor > U32_MAX || - data->runtime > U32_MAX || data->tables > U32_MAX; early_memunmap(data, sizeof(*data)); } else { + efi_fw_vendor = systab64->fw_vendor; + efi_config_table = systab64->tables; + over4g |= systab64->fw_vendor > U32_MAX || - systab64->runtime > U32_MAX || systab64->tables > U32_MAX; } + efi_nr_tables = systab64->nr_tables; } else { const efi_system_table_32_t *systab32 = p; - efi_systab.hdr = systab32->hdr; - efi_systab.fw_vendor = systab32->fw_vendor; - efi_systab.fw_revision = systab32->fw_revision; - efi_systab.con_in_handle = systab32->con_in_handle; - efi_systab.con_in = systab32->con_in; - efi_systab.con_out_handle = systab32->con_out_handle; - efi_systab.con_out = (void *)(unsigned long)systab32->con_out; - efi_systab.stderr_handle = systab32->stderr_handle; - efi_systab.stderr = systab32->stderr; - efi_systab.runtime = (void *)(unsigned long)systab32->runtime; - efi_systab.boottime = (void *)(unsigned long)systab32->boottime; - efi_systab.nr_tables = systab32->nr_tables; - efi_systab.tables = systab32->tables; + efi_fw_vendor = systab32->fw_vendor; + efi_runtime = systab32->runtime; + efi_config_table = systab32->tables; + efi_nr_tables = systab32->nr_tables; } + efi.runtime_version = hdr->revision; + + efi_systab_report_header(hdr, efi_fw_vendor); early_memunmap(p, size); if (IS_ENABLED(CONFIG_X86_32) && over4g) { @@ -419,29 +407,40 @@ static int __init efi_systab_init(u64 phys) return -EINVAL; } - efi.systab = &efi_systab; + return 0; +} + +static int __init efi_config_init(const efi_config_table_type_t *arch_tables) +{ + void *config_tables; + int sz, ret; + + if (efi_nr_tables == 0) + return 0; + + if (efi_enabled(EFI_64BIT)) + sz = sizeof(efi_config_table_64_t); + else + sz = sizeof(efi_config_table_32_t); /* - * Verify the EFI Table + * Let's see what config tables the firmware passed to us. */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { - pr_err("System table signature incorrect!\n"); - return -EINVAL; + config_tables = early_memremap(efi_config_table, efi_nr_tables * sz); + if (config_tables == NULL) { + pr_err("Could not map Configuration table!\n"); + return -ENOMEM; } - if ((efi.systab->hdr.revision >> 16) == 0) - pr_err("Warning: System table version %d.%02d, expected 1.00 or greater!\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - return 0; + ret = efi_config_parse_tables(config_tables, efi_nr_tables, + arch_tables); + + early_memunmap(config_tables, efi_nr_tables * sz); + return ret; } void __init efi_init(void) { - efi_char16_t *c16; - char vendor[100] = "unknown"; - int i = 0; - if (IS_ENABLED(CONFIG_X86_32) && (boot_params.efi_info.efi_systab_hi || boot_params.efi_info.efi_memmap_hi)) { @@ -455,29 +454,7 @@ void __init efi_init(void) if (efi_systab_init(efi_systab_phys)) return; - efi.config_table = (unsigned long)efi.systab->tables; - efi.fw_vendor = (unsigned long)efi.systab->fw_vendor; - efi.runtime = (unsigned long)efi.systab->runtime; - - /* - * Show what we know for posterity - */ - c16 = early_memremap_ro(efi.systab->fw_vendor, - sizeof(vendor) * sizeof(efi_char16_t)); - if (c16) { - for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i) - vendor[i] = c16[i]; - vendor[i] = '\0'; - early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); - } else { - pr_err("Could not map the firmware vendor!\n"); - } - - pr_info("EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - if (efi_reuse_config(efi.systab->tables, efi.systab->nr_tables)) + if (efi_reuse_config(efi_config_table, efi_nr_tables)) return; if (efi_config_init(arch_tables)) @@ -496,6 +473,22 @@ void __init efi_init(void) return; } + /* Parse the EFI Properties table if it exists */ + if (prop_phys != EFI_INVALID_TABLE_ADDR) { + efi_properties_table_t *tbl; + + tbl = early_memremap_ro(prop_phys, sizeof(*tbl)); + if (tbl == NULL) { + pr_err("Could not map Properties table!\n"); + } else { + if (tbl->memory_protection_attribute & + EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA) + set_bit(EFI_NX_PE_DATA, &efi.flags); + + early_memunmap(tbl, sizeof(*tbl)); + } + } + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); efi_clean_memmap(); @@ -602,20 +595,6 @@ static void __init efi_merge_regions(void) } } -static void __init get_systab_virt_addr(efi_memory_desc_t *md) -{ - unsigned long size; - u64 end, systab; - - size = md->num_pages << EFI_PAGE_SHIFT; - end = md->phys_addr + size; - systab = efi_systab_phys; - if (md->phys_addr <= systab && systab < end) { - systab += md->virt_addr - md->phys_addr; - efi.systab = (efi_system_table_t *)(unsigned long)systab; - } -} - static void *realloc_pages(void *old_memmap, int old_shift) { void *ret; @@ -771,7 +750,6 @@ static void * __init efi_map_regions(int *count, int *pg_shift) continue; efi_map_region(md); - get_systab_virt_addr(md); if (left < desc_size) { new_memmap = realloc_pages(new_memmap, *pg_shift); @@ -797,8 +775,6 @@ static void __init kexec_enter_virtual_mode(void) efi_memory_desc_t *md; unsigned int num_pages; - efi.systab = NULL; - /* * We don't do virtual mode, since we don't do runtime services, on * non-native EFI. With the UV1 memmap, we don't do runtime services in @@ -821,10 +797,8 @@ static void __init kexec_enter_virtual_mode(void) * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. */ - for_each_efi_memory_desc(md) { + for_each_efi_memory_desc(md) efi_map_region_fixed(md); /* FIXME: add error handling */ - get_systab_virt_addr(md); - } /* * Unregister the early EFI memmap from efi_init() and install @@ -839,8 +813,6 @@ static void __init kexec_enter_virtual_mode(void) return; } - BUG_ON(!efi.systab); - num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE); num_pages >>= PAGE_SHIFT; @@ -850,15 +822,6 @@ static void __init kexec_enter_virtual_mode(void) } efi_sync_low_kernel_mappings(); - - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - * - * Call EFI services through wrapper functions. - */ - efi.runtime_version = efi_systab.hdr.revision; - efi_native_runtime_setup(); #endif } @@ -892,8 +855,6 @@ static void __init __efi_enter_virtual_mode(void) efi_status_t status; unsigned long pa; - efi.systab = NULL; - if (efi_alloc_page_tables()) { pr_err("Failed to allocate EFI page tables\n"); goto err; @@ -925,9 +886,6 @@ static void __init __efi_enter_virtual_mode(void) efi_print_memmap(); } - if (WARN_ON(!efi.systab)) - goto err; - if (efi_setup_page_tables(pa, 1 << pg_shift)) goto err; @@ -936,23 +894,17 @@ static void __init __efi_enter_virtual_mode(void) status = efi_set_virtual_address_map(efi.memmap.desc_size * count, efi.memmap.desc_size, efi.memmap.desc_version, - (efi_memory_desc_t *)pa); + (efi_memory_desc_t *)pa, + efi_systab_phys); if (status != EFI_SUCCESS) { pr_err("Unable to switch EFI into virtual mode (status=%lx)!\n", status); goto err; } + efi_check_for_embedded_firmwares(); efi_free_boot_services(); - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - * - * Call EFI services through wrapper functions. - */ - efi.runtime_version = efi_systab.hdr.revision; - if (!efi_is_mixed()) efi_native_runtime_setup(); else @@ -978,6 +930,8 @@ void __init efi_enter_virtual_mode(void) if (efi_enabled(EFI_PARAVIRT)) return; + efi.runtime = (efi_runtime_services_t *)efi_runtime; + if (efi_setup) kexec_enter_virtual_mode(); else @@ -999,3 +953,43 @@ bool efi_is_table_address(unsigned long phys_addr) return false; } + +char *efi_systab_show_arch(char *str) +{ + if (uga_phys != EFI_INVALID_TABLE_ADDR) + str += sprintf(str, "UGA=0x%lx\n", uga_phys); + return str; +} + +#define EFI_FIELD(var) efi_ ## var + +#define EFI_ATTR_SHOW(name) \ +static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "0x%lx\n", EFI_FIELD(name)); \ +} + +EFI_ATTR_SHOW(fw_vendor); +EFI_ATTR_SHOW(runtime); +EFI_ATTR_SHOW(config_table); + +struct kobj_attribute efi_attr_fw_vendor = __ATTR_RO(fw_vendor); +struct kobj_attribute efi_attr_runtime = __ATTR_RO(runtime); +struct kobj_attribute efi_attr_config_table = __ATTR_RO(config_table); + +umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n) +{ + if (attr == &efi_attr_fw_vendor.attr) { + if (efi_enabled(EFI_PARAVIRT) || + efi_fw_vendor == EFI_INVALID_TABLE_ADDR) + return 0; + } else if (attr == &efi_attr_runtime.attr) { + if (efi_runtime == EFI_INVALID_TABLE_ADDR) + return 0; + } else if (attr == &efi_attr_config_table.attr) { + if (efi_config_table == EFI_INVALID_TABLE_ADDR) + return 0; + } + return attr->mode; +} diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 081d466002c9..c049c432745d 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -66,14 +66,16 @@ void __init efi_map_region(efi_memory_desc_t *md) void __init efi_map_region_fixed(efi_memory_desc_t *md) {} void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} -efi_status_t efi_call_svam(efi_set_virtual_address_map_t *__efiapi *, - u32, u32, u32, void *); +efi_status_t efi_call_svam(efi_runtime_services_t * const *, + u32, u32, u32, void *, u32); efi_status_t __init efi_set_virtual_address_map(unsigned long memory_map_size, unsigned long descriptor_size, u32 descriptor_version, - efi_memory_desc_t *virtual_map) + efi_memory_desc_t *virtual_map, + unsigned long systab_phys) { + const efi_system_table_t *systab = (efi_system_table_t *)systab_phys; struct desc_ptr gdt_descr; efi_status_t status; unsigned long flags; @@ -90,9 +92,10 @@ efi_status_t __init efi_set_virtual_address_map(unsigned long memory_map_size, /* Disable interrupts around EFI calls: */ local_irq_save(flags); - status = efi_call_svam(&efi.systab->runtime->set_virtual_address_map, + status = efi_call_svam(&systab->runtime, memory_map_size, descriptor_size, - descriptor_version, virtual_map); + descriptor_version, virtual_map, + __pa(&efi.runtime)); local_irq_restore(flags); load_fixmap_gdt(0); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index fa8506e76bbe..211bb9358b73 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -180,7 +180,7 @@ void efi_sync_low_kernel_mappings(void) static inline phys_addr_t virt_to_phys_or_null_size(void *va, unsigned long size) { - bool bad_size; + phys_addr_t pa; if (!va) return 0; @@ -188,16 +188,13 @@ virt_to_phys_or_null_size(void *va, unsigned long size) if (virt_addr_valid(va)) return virt_to_phys(va); - /* - * A fully aligned variable on the stack is guaranteed not to - * cross a page bounary. Try to catch strings on the stack by - * checking that 'size' is a power of two. - */ - bad_size = size > PAGE_SIZE || !is_power_of_2(size); + pa = slow_virt_to_phys(va); - WARN_ON(!IS_ALIGNED((unsigned long)va, size) || bad_size); + /* check if the object crosses a page boundary */ + if (WARN_ON((pa ^ (pa + size - 1)) & PAGE_MASK)) + return 0; - return slow_virt_to_phys(va); + return pa; } #define virt_to_phys_or_null(addr) \ @@ -500,12 +497,9 @@ static DEFINE_SPINLOCK(efi_runtime_lock); */ #define __efi_thunk(func, ...) \ ({ \ - efi_runtime_services_32_t *__rt; \ unsigned short __ds, __es; \ efi_status_t ____s; \ \ - __rt = (void *)(unsigned long)efi.systab->mixed_mode.runtime; \ - \ savesegment(ds, __ds); \ savesegment(es, __es); \ \ @@ -513,7 +507,7 @@ static DEFINE_SPINLOCK(efi_runtime_lock); loadsegment(ds, __KERNEL_DS); \ loadsegment(es, __KERNEL_DS); \ \ - ____s = efi64_thunk(__rt->func, __VA_ARGS__); \ + ____s = efi64_thunk(efi.runtime->mixed_mode.func, __VA_ARGS__); \ \ loadsegment(ds, __ds); \ loadsegment(es, __es); \ @@ -568,85 +562,25 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size, static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc) { - efi_status_t status; - u32 phys_tm, phys_tc; - unsigned long flags; - - spin_lock(&rtc_lock); - spin_lock_irqsave(&efi_runtime_lock, flags); - - phys_tm = virt_to_phys_or_null(tm); - phys_tc = virt_to_phys_or_null(tc); - - status = efi_thunk(get_time, phys_tm, phys_tc); - - spin_unlock_irqrestore(&efi_runtime_lock, flags); - spin_unlock(&rtc_lock); - - return status; + return EFI_UNSUPPORTED; } static efi_status_t efi_thunk_set_time(efi_time_t *tm) { - efi_status_t status; - u32 phys_tm; - unsigned long flags; - - spin_lock(&rtc_lock); - spin_lock_irqsave(&efi_runtime_lock, flags); - - phys_tm = virt_to_phys_or_null(tm); - - status = efi_thunk(set_time, phys_tm); - - spin_unlock_irqrestore(&efi_runtime_lock, flags); - spin_unlock(&rtc_lock); - - return status; + return EFI_UNSUPPORTED; } static efi_status_t efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm) { - efi_status_t status; - u32 phys_enabled, phys_pending, phys_tm; - unsigned long flags; - - spin_lock(&rtc_lock); - spin_lock_irqsave(&efi_runtime_lock, flags); - - phys_enabled = virt_to_phys_or_null(enabled); - phys_pending = virt_to_phys_or_null(pending); - phys_tm = virt_to_phys_or_null(tm); - - status = efi_thunk(get_wakeup_time, phys_enabled, - phys_pending, phys_tm); - - spin_unlock_irqrestore(&efi_runtime_lock, flags); - spin_unlock(&rtc_lock); - - return status; + return EFI_UNSUPPORTED; } static efi_status_t efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { - efi_status_t status; - u32 phys_tm; - unsigned long flags; - - spin_lock(&rtc_lock); - spin_lock_irqsave(&efi_runtime_lock, flags); - - phys_tm = virt_to_phys_or_null(tm); - - status = efi_thunk(set_wakeup_time, enabled, phys_tm); - - spin_unlock_irqrestore(&efi_runtime_lock, flags); - spin_unlock(&rtc_lock); - - return status; + return EFI_UNSUPPORTED; } static unsigned long efi_name_size(efi_char16_t *name) @@ -658,6 +592,8 @@ static efi_status_t efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, u32 *attr, unsigned long *data_size, void *data) { + u8 buf[24] __aligned(8); + efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd)); efi_status_t status; u32 phys_name, phys_vendor, phys_attr; u32 phys_data_size, phys_data; @@ -665,14 +601,19 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, spin_lock_irqsave(&efi_runtime_lock, flags); + *vnd = *vendor; + phys_data_size = virt_to_phys_or_null(data_size); - phys_vendor = virt_to_phys_or_null(vendor); + phys_vendor = virt_to_phys_or_null(vnd); phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); phys_attr = virt_to_phys_or_null(attr); phys_data = virt_to_phys_or_null_size(data, *data_size); - status = efi_thunk(get_variable, phys_name, phys_vendor, - phys_attr, phys_data_size, phys_data); + if (!phys_name || (data && !phys_data)) + status = EFI_INVALID_PARAMETER; + else + status = efi_thunk(get_variable, phys_name, phys_vendor, + phys_attr, phys_data_size, phys_data); spin_unlock_irqrestore(&efi_runtime_lock, flags); @@ -683,19 +624,25 @@ static efi_status_t efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data) { + u8 buf[24] __aligned(8); + efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd)); u32 phys_name, phys_vendor, phys_data; efi_status_t status; unsigned long flags; spin_lock_irqsave(&efi_runtime_lock, flags); + *vnd = *vendor; + phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); - phys_vendor = virt_to_phys_or_null(vendor); + phys_vendor = virt_to_phys_or_null(vnd); phys_data = virt_to_phys_or_null_size(data, data_size); - /* If data_size is > sizeof(u32) we've got problems */ - status = efi_thunk(set_variable, phys_name, phys_vendor, - attr, data_size, phys_data); + if (!phys_name || !phys_data) + status = EFI_INVALID_PARAMETER; + else + status = efi_thunk(set_variable, phys_name, phys_vendor, + attr, data_size, phys_data); spin_unlock_irqrestore(&efi_runtime_lock, flags); @@ -707,6 +654,8 @@ efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data) { + u8 buf[24] __aligned(8); + efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd)); u32 phys_name, phys_vendor, phys_data; efi_status_t status; unsigned long flags; @@ -714,13 +663,17 @@ efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) return EFI_NOT_READY; + *vnd = *vendor; + phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); - phys_vendor = virt_to_phys_or_null(vendor); + phys_vendor = virt_to_phys_or_null(vnd); phys_data = virt_to_phys_or_null_size(data, data_size); - /* If data_size is > sizeof(u32) we've got problems */ - status = efi_thunk(set_variable, phys_name, phys_vendor, - attr, data_size, phys_data); + if (!phys_name || !phys_data) + status = EFI_INVALID_PARAMETER; + else + status = efi_thunk(set_variable, phys_name, phys_vendor, + attr, data_size, phys_data); spin_unlock_irqrestore(&efi_runtime_lock, flags); @@ -732,39 +685,36 @@ efi_thunk_get_next_variable(unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) { + u8 buf[24] __aligned(8); + efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd)); efi_status_t status; u32 phys_name_size, phys_name, phys_vendor; unsigned long flags; spin_lock_irqsave(&efi_runtime_lock, flags); + *vnd = *vendor; + phys_name_size = virt_to_phys_or_null(name_size); - phys_vendor = virt_to_phys_or_null(vendor); + phys_vendor = virt_to_phys_or_null(vnd); phys_name = virt_to_phys_or_null_size(name, *name_size); - status = efi_thunk(get_next_variable, phys_name_size, - phys_name, phys_vendor); + if (!phys_name) + status = EFI_INVALID_PARAMETER; + else + status = efi_thunk(get_next_variable, phys_name_size, + phys_name, phys_vendor); spin_unlock_irqrestore(&efi_runtime_lock, flags); + *vendor = *vnd; return status; } static efi_status_t efi_thunk_get_next_high_mono_count(u32 *count) { - efi_status_t status; - u32 phys_count; - unsigned long flags; - - spin_lock_irqsave(&efi_runtime_lock, flags); - - phys_count = virt_to_phys_or_null(count); - status = efi_thunk(get_next_high_mono_count, phys_count); - - spin_unlock_irqrestore(&efi_runtime_lock, flags); - - return status; + return EFI_UNSUPPORTED; } static void @@ -886,8 +836,10 @@ efi_status_t __init __no_sanitize_address efi_set_virtual_address_map(unsigned long memory_map_size, unsigned long descriptor_size, u32 descriptor_version, - efi_memory_desc_t *virtual_map) + efi_memory_desc_t *virtual_map, + unsigned long systab_phys) { + const efi_system_table_t *systab = (efi_system_table_t *)systab_phys; efi_status_t status; unsigned long flags; pgd_t *save_pgd = NULL; @@ -910,13 +862,16 @@ efi_set_virtual_address_map(unsigned long memory_map_size, /* Disable interrupts around EFI calls: */ local_irq_save(flags); - status = efi_call(efi.systab->runtime->set_virtual_address_map, + status = efi_call(efi.runtime->set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); local_irq_restore(flags); kernel_fpu_end(); + /* grab the virtually remapped EFI runtime services table pointer */ + efi.runtime = READ_ONCE(systab->runtime); + if (save_pgd) efi_uv1_memmap_phys_epilog(save_pgd); else diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index 75c46e7a809f..09ec84f6ef51 100644 --- a/arch/x86/platform/efi/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -8,14 +8,20 @@ #include <linux/linkage.h> #include <linux/init.h> +#include <asm/asm-offsets.h> #include <asm/page_types.h> __INIT SYM_FUNC_START(efi_call_svam) - push 8(%esp) - push 8(%esp) + push %ebp + movl %esp, %ebp + push %ebx + + push 16(%esp) + push 16(%esp) push %ecx push %edx + movl %eax, %ebx // &systab_phys->runtime /* * Switch to the flat mapped alias of this routine, by jumping to the @@ -35,15 +41,20 @@ SYM_FUNC_START(efi_call_svam) subl $__PAGE_OFFSET, %esp /* call the EFI routine */ - call *(%eax) + movl (%eax), %eax + call *EFI_svam(%eax) - /* convert ESP back to a kernel VA, and pop the outgoing args */ - addl $__PAGE_OFFSET + 16, %esp + /* grab the virtually remapped EFI runtime services table pointer */ + movl (%ebx), %ecx + movl 36(%esp), %edx // &efi.runtime + movl %ecx, (%edx) /* re-enable paging */ movl %cr0, %edx orl $0x80000000, %edx movl %edx, %cr0 + movl 16(%esp), %ebx + leave ret SYM_FUNC_END(efi_call_svam) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 88d32c06cffa..a5a469cdf5bf 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -410,6 +410,10 @@ void __init efi_free_boot_services(void) int num_entries = 0; void *new, *new_md; + /* Keep all regions for /sys/kernel/debug/efi */ + if (efi_enabled(EFI_DBG)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -537,7 +541,7 @@ int __init efi_reuse_config(u64 tables, int nr_tables) goto out_memremap; } - for (i = 0; i < efi.systab->nr_tables; i++) { + for (i = 0; i < nr_tables; i++) { efi_guid_t guid; guid = ((efi_config_table_64_t *)p)->guid; @@ -659,12 +663,9 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, return 1; } -#define ICPU(family, model, quirk_handler) \ - { X86_VENDOR_INTEL, family, model, X86_FEATURE_ANY, \ - (unsigned long)&quirk_handler } - static const struct x86_cpu_id efi_capsule_quirk_ids[] = { - ICPU(5, 9, qrk_capsule_setup_info), /* Intel Quark X1000 */ + X86_MATCH_VENDOR_FAM_MODEL(INTEL, 5, INTEL_FAM5_QUARK_X1000, + &qrk_capsule_setup_info), { } }; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index e3f4bfc08f78..31dda18bb370 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,11 +60,8 @@ static struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; -#define ICPU(model, ddata) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata } - static const struct x86_cpu_id bt_sfi_cpu_ids[] = { - ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, tng_bt_sfi_data), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &tng_bt_sfi_data), {} }; diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index e9d97d52475e..0286fe1b14b5 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -569,7 +569,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) } static const struct x86_cpu_id imr_ids[] __initconst = { - { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ + X86_MATCH_VENDOR_FAM_MODEL(INTEL, 5, INTEL_FAM5_QUARK_X1000, NULL), {} }; diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 4307830e1b6f..570e3062faac 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -105,7 +105,7 @@ static void __init imr_self_test(void) } static const struct x86_cpu_id imr_ids[] __initconst = { - { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ + X86_MATCH_VENDOR_FAM_MODEL(INTEL, 5, INTEL_FAM5_QUARK_X1000, NULL), {} }; diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index 9e2444500428..526f70f27c1c 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -265,7 +265,7 @@ static void iosf_mbi_reset_semaphore(void) iosf_mbi_sem_address, 0, PUNIT_SEMAPHORE_BIT)) dev_err(&mbi_pdev->dev, "Error P-Unit semaphore reset failed\n"); - pm_qos_update_request(&iosf_mbi_pm_qos, PM_QOS_DEFAULT_VALUE); + cpu_latency_qos_update_request(&iosf_mbi_pm_qos, PM_QOS_DEFAULT_VALUE); blocking_notifier_call_chain(&iosf_mbi_pmic_bus_access_notifier, MBI_PMIC_BUS_ACCESS_END, NULL); @@ -301,8 +301,8 @@ static void iosf_mbi_reset_semaphore(void) * 4) When CPU cores enter C6 or C7 the P-Unit needs to talk to the PMIC * if this happens while the kernel itself is accessing the PMIC I2C bus * the SoC hangs. - * As the third step we call pm_qos_update_request() to disallow the CPU - * to enter C6 or C7. + * As the third step we call cpu_latency_qos_update_request() to disallow the + * CPU to enter C6 or C7. * * 5) The P-Unit has a PMIC bus semaphore which we can request to stop * autonomous P-Unit tasks from accessing the PMIC I2C bus while we hold it. @@ -338,7 +338,7 @@ int iosf_mbi_block_punit_i2c_access(void) * requires the P-Unit to talk to the PMIC and if this happens while * we're holding the semaphore, the SoC hangs. */ - pm_qos_update_request(&iosf_mbi_pm_qos, 0); + cpu_latency_qos_update_request(&iosf_mbi_pm_qos, 0); /* host driver writes to side band semaphore register */ ret = iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_REG_WRITE, @@ -547,8 +547,7 @@ static int __init iosf_mbi_init(void) { iosf_debugfs_init(); - pm_qos_add_request(&iosf_mbi_pm_qos, PM_QOS_CPU_DMA_LATENCY, - PM_QOS_DEFAULT_VALUE); + cpu_latency_qos_add_request(&iosf_mbi_pm_qos, PM_QOS_DEFAULT_VALUE); return pci_register_driver(&iosf_mbi_pci_driver); } @@ -561,7 +560,7 @@ static void __exit iosf_mbi_exit(void) pci_dev_put(mbi_pdev); mbi_pdev = NULL; - pm_qos_remove_request(&iosf_mbi_pm_qos); + cpu_latency_qos_remove_request(&iosf_mbi_pm_qos); } module_init(iosf_mbi_init); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 915bb1639763..aaff9ed7ff45 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -475,20 +475,8 @@ static int msr_save_cpuid_features(const struct x86_cpu_id *c) } static const struct x86_cpu_id msr_save_cpu_table[] = { - { - .vendor = X86_VENDOR_AMD, - .family = 0x15, - .model = X86_MODEL_ANY, - .feature = X86_FEATURE_ANY, - .driver_data = (kernel_ulong_t)msr_save_cpuid_features, - }, - { - .vendor = X86_VENDOR_AMD, - .family = 0x16, - .model = X86_MODEL_ANY, - .feature = X86_FEATURE_ANY, - .driver_data = (kernel_ulong_t)msr_save_cpuid_features, - }, + X86_MATCH_VENDOR_FAM(AMD, 0x15, &msr_save_cpuid_features), + X86_MATCH_VENDOR_FAM(AMD, 0x16, &msr_save_cpuid_features), {} }; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 99b6332ba540..b11ec5d8f8ac 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -71,5 +71,6 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables GCOV_PROFILE := n UBSAN_SANITIZE := n diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 64d135d1ee63..63aa51875ba0 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -71,7 +71,6 @@ SECTIONS /DISCARD/ : { *(.note*) *(.debug*) - *(.eh_frame*) } #include "pasyms.h" diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index a8985e1f7432..95d26a69088b 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -27,11 +27,6 @@ config X86_64 def_bool 64BIT select MODULES_USE_ELF_RELA -config ARCH_DEFCONFIG - string - default "arch/um/configs/i386_defconfig" if X86_32 - default "arch/um/configs/x86_64_defconfig" if X86_64 - config 3_LEVEL_PGTABLES bool "Three-level pagetables" if !64BIT default 64BIT diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 33c51c064c77..77f70b969d14 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -21,6 +21,7 @@ obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o +subarch-y += ../kernel/sys_ia32.o else diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 9649b5ad2ca2..2ed81e581755 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -7,7 +7,7 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> -#include <generated/user_constants.h> +#include <asm/unistd.h> #include <asm/syscall.h> #define __NO_STUBS @@ -26,11 +26,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index c8bc7fb8cbd6..2e8544dafbb0 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -7,7 +7,7 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> -#include <generated/user_constants.h> +#include <asm/unistd.h> #include <asm/syscall.h> #define __NO_STUBS @@ -36,11 +36,14 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_X32(nr, sym) +#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) + +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_64.h> #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym, +#define __SYSCALL_64(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 5b37b7f952dd..c51dd8363d25 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,18 +9,6 @@ #include <linux/ptrace.h> #include <asm/types.h> -#ifdef __i386__ -#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, -static char syscalls[] = { -#include <asm/syscalls_32.h> -}; -#else -#define __SYSCALL_64(nr, sym, qual) [nr] = 1, -static char syscalls[] = { -#include <asm/syscalls_64.h> -}; -#endif - #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -94,7 +82,4 @@ void foo(void) DEFINE(UM_PROT_READ, PROT_READ); DEFINE(UM_PROT_WRITE, PROT_WRITE); DEFINE(UM_PROT_EXEC, PROT_EXEC); - - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); - DEFINE(NR_syscalls, sizeof(syscalls)); } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 1f756ffffe8b..507f4fb88fa7 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -72,6 +72,9 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> +#ifdef CONFIG_X86_IOPL_IOPERM +#include <asm/io_bitmap.h> +#endif #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -837,6 +840,25 @@ static void xen_load_sp0(unsigned long sp0) this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } +#ifdef CONFIG_X86_IOPL_IOPERM +static void xen_update_io_bitmap(void) +{ + struct physdev_set_iobitmap iobitmap; + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + + native_tss_update_io_bitmap(); + + iobitmap.bitmap = (uint8_t *)(&tss->x86_tss) + + tss->x86_tss.io_bitmap_base; + if (tss->x86_tss.io_bitmap_base == IO_BITMAP_OFFSET_INVALID) + iobitmap.nr_ports = 0; + else + iobitmap.nr_ports = IO_BITMAP_BITS; + + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap); +} +#endif + static void xen_io_delay(void) { } @@ -896,14 +918,15 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; +#ifdef CONFIG_X86_64 + unsigned int which; + u64 base; +#endif ret = 0; switch (msr) { #ifdef CONFIG_X86_64 - unsigned which; - u64 base; - case MSR_FS_BASE: which = SEGBASE_FS; goto set; case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; @@ -1046,6 +1069,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_idt_entry = xen_write_idt_entry, .load_sp0 = xen_load_sp0, +#ifdef CONFIG_X86_IOPL_IOPERM + .update_io_bitmap = xen_update_io_bitmap, +#endif .io_delay = xen_io_delay, /* Xen takes care of %gs when switching to usermode for us */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7a43b2ae19f1..2097fa0ebdb5 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -132,7 +132,7 @@ void __init xen_smp_cpus_done(unsigned int max_cpus) if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) continue; - rc = cpu_down(cpu); + rc = remove_cpu(cpu); if (rc == 0) { /* diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index befbdd8b17f0..c8897aad13cd 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -145,12 +145,19 @@ static struct notifier_block xen_pvclock_gtod_notifier = { .notifier_call = xen_pvclock_gtod_notify, }; +static int xen_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK); + return 0; +} + static struct clocksource xen_clocksource __read_mostly = { - .name = "xen", - .rating = 400, - .read = xen_clocksource_get_cycles, - .mask = ~0, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "xen", + .rating = 400, + .read = xen_clocksource_get_cycles, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .enable = xen_cs_enable, }; /* @@ -412,12 +419,13 @@ void xen_restore_time_memory_area(void) ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); /* - * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the - * secondary time info with Xen or if we migrated to a host without the - * necessary flags. On both of these cases what happens is either - * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT - * bit set. Userspace checks the latter and if 0, it discards the data - * in pvti and fallbacks to a system call for a reliable timestamp. + * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to + * register the secondary time info with Xen or if we migrated to a + * host without the necessary flags. On both of these cases what + * happens is either process seeing a zeroed out pvti or seeing no + * PVCLOCK_TSC_STABLE_BIT bit set. Userspace checks the latter and + * if 0, it discards the data in pvti and fallbacks to a system + * call for a reliable timestamp. */ if (ret != 0) pr_notice("Cannot restore secondary vcpu_time_info (err %d)", @@ -443,7 +451,7 @@ static void xen_setup_vsyscall_time_info(void) ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); if (ret) { - pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret); + pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret); free_page((unsigned long)ti); return; } @@ -460,14 +468,14 @@ static void xen_setup_vsyscall_time_info(void) if (!ret) free_page((unsigned long)ti); - pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n"); + pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n"); return; } xen_clock = ti; pvclock_set_pvti_cpu0_va(xen_clock); - xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK; + xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; } static void __init xen_time_init(void) |