Diffstat (limited to 'arch/x86')
329 files changed, 12424 insertions, 11358 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a294ee..cd18994a9555 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -98,7 +98,6 @@ config X86 select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL - select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP @@ -106,6 +105,7 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT + select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE @@ -127,7 +127,7 @@ config X86 select HAVE_EBPF_JIT if X86_64 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EXIT_THREAD - select HAVE_FENTRY if X86_64 + select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER @@ -160,6 +160,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -290,6 +291,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KASAN_SHADOW_OFFSET hex depends on KASAN + default 0xdff8000000000000 if X86_5LEVEL default 0xdffffc0000000000 config HAVE_INTEL_TXT @@ -1043,6 +1045,14 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. +config X86_MCELOG_LEGACY + bool "Support for deprecated /dev/mcelog character device" + depends on X86_MCE + ---help--- + Enable support for /dev/mcelog which is needed by the old mcelog + userspace logging daemon. Consider switching to the new generation + rasdaemon solution. + config X86_MCE_INTEL def_bool y prompt "Intel MCE features" @@ -1072,7 +1082,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. @@ -1966,7 +1976,7 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE - default n + default y ---help--- In support of Kernel Address Space Layout Randomization (KASLR), this randomizes the physical address at which the kernel image @@ -1996,7 +2006,7 @@ config RANDOMIZE_BASE theoretically possible, but the implementations are further limited due to memory layouts. - If unsure, say N. + If unsure, say Y. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS @@ -2045,7 +2055,7 @@ config RANDOMIZE_MEMORY configuration have in average 30,000 different possible virtual addresses for each memory section. - If unsure, say N. + If unsure, say Y. 
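(Aside on the new KASAN_SHADOW_OFFSET default for X86_5LEVEL above: KASAN maps every 8 bytes of address space to one shadow byte, i.e. shadow = (addr >> 3) + offset. The stand-alone sketch below is not part of the patch; it only plugs the usual 4-level and 5-level start-of-kernel-space addresses into that formula to show where each offset places the shadow region.)

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: KASAN shadow mapping, one shadow byte per 8 bytes
 * of address space (KASAN_SHADOW_SCALE_SHIFT == 3 on x86). */
static uint64_t kasan_shadow(uint64_t addr, uint64_t offset)
{
	return (addr >> 3) + offset;
}

int main(void)
{
	/* 4-level paging: kernel half starts at 0xffff800000000000 */
	printf("4-level shadow start: 0x%llx\n",
	       (unsigned long long)kasan_shadow(0xffff800000000000ULL,
						0xdffffc0000000000ULL));
	/* 5-level paging: kernel half starts at 0xff00000000000000 */
	printf("5-level shadow start: 0x%llx\n",
	       (unsigned long long)kasan_shadow(0xff00000000000000ULL,
						0xdff8000000000000ULL));
	return 0;
}

The two results, 0xffffec0000000000 and 0xffd8000000000000, correspond to the kasan shadow ranges listed in the x86_64 memory-map documentation of that era for 4- and 5-level paging.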
config RANDOMIZE_MEMORY_PHYSICAL_PADDING hex "Physical memory mapping padding" if EXPERT diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 63c1d13aaf9f..fcb7604172ce 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,6 +5,9 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" +config EARLY_PRINTK_USB + bool + config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y @@ -23,19 +26,20 @@ config EARLY_PRINTK This is useful for kernel debugging when your machine crashes very early before the console code is initialized. For normal operation it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, + with klogd/syslogd or the X server. You should normally say N here, unless you want to debug such a crash. config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" depends on EARLY_PRINTK && PCI + select EARLY_PRINTK_USB ---help--- Write kernel log output directly into the EHCI debug port. This is useful for kernel debugging when your machine crashes very early before the console code is initialized. For normal operation it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, + with klogd/syslogd or the X server. You should normally say N here, unless you want to debug such a crash. You need usb debug device. config EARLY_PRINTK_EFI @@ -48,6 +52,25 @@ config EARLY_PRINTK_EFI This is useful for kernel debugging when your machine crashes very early before the console code is initialized. +config EARLY_PRINTK_USB_XDBC + bool "Early printk via the xHCI debug port" + depends on EARLY_PRINTK && PCI + select EARLY_PRINTK_USB + ---help--- + Write kernel log output directly into the xHCI debug port. + + One use for this feature is kernel debugging, for example when your + machine crashes very early before the regular console code is + initialized. Other uses include simpler, lockless logging instead of + a full-blown printk console driver + klogd. + + For normal production environments this is normally not recommended, + because it doesn't feed events into klogd/syslogd and doesn't try to + print anything on the screen. + + You should normally say N here, unless you want to debug early + crashes or need a very simple printk logging facility. + config X86_PTDUMP_CORE def_bool n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d449337a360..4430dd489620 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -88,10 +88,10 @@ else KBUILD_CFLAGS += -m64 # Align jump targets to 1 byte, not the default 16 bytes: - KBUILD_CFLAGS += -falign-jumps=1 + KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1) # Pack loops tightly as well: - KBUILD_CFLAGS += -falign-loops=1 + KBUILD_CFLAGS += $(call cc-option,-falign-loops=1) # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) @@ -120,10 +120,6 @@ else # -funit-at-a-time shrinks the kernel .text considerably # unfortunately it makes reading oopses harder. 
KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) - - # this works around some issues with generating unwind tables in older gccs - # newer gccs do it by default - KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) endif ifdef CONFIG_X86_X32 @@ -147,6 +143,45 @@ ifeq ($(CONFIG_KMEMCHECK),y) KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) endif +# +# If the function graph tracer is used with mcount instead of fentry, +# '-maccumulate-outgoing-args' is needed to prevent a GCC bug +# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42109) +# +ifdef CONFIG_FUNCTION_GRAPH_TRACER + ifndef CONFIG_HAVE_FENTRY + ACCUMULATE_OUTGOING_ARGS := 1 + else + ifeq ($(call cc-option-yn, -mfentry), n) + ACCUMULATE_OUTGOING_ARGS := 1 + + # GCC ignores '-maccumulate-outgoing-args' when used with '-Os'. + # If '-Os' is enabled, disable it and print a warning. + ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + undefine CONFIG_CC_OPTIMIZE_FOR_SIZE + $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.) + endif + + endif + endif +endif + +# +# Jump labels need '-maccumulate-outgoing-args' for gcc < 4.5.2 to prevent a +# GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46226). There's no way +# to test for this bug at compile-time because the test case needs to execute, +# which is a no-go for cross compilers. So check the GCC version instead. +# +ifdef CONFIG_JUMP_LABEL + ifneq ($(ACCUMULATE_OUTGOING_ARGS), 1) + ACCUMULATE_OUTGOING_ARGS = $(call cc-if-fullversion, -lt, 040502, 1) + endif +endif + +ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) + KBUILD_CFLAGS += -maccumulate-outgoing-args +endif + # Stackpointer is addressed different for 32 bit and 64 bit x86 sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 6647ed49c66c..a45eb15b7cf2 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -45,24 +45,6 @@ cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) -# Work around the pentium-mmx code generator madness of gcc4.4.x which -# does stack alignment by generating horrible code _before_ the mcount -# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph -# tracer assumptions. For i686, generic, core2 this is set by the -# compiler anyway -ifeq ($(CONFIG_FUNCTION_GRAPH_TRACER), y) -ADD_ACCUMULATE_OUTGOING_ARGS := y -endif - -# Work around to a bug with asm goto with first implementations of it -# in gcc causing gcc to mess up the push and pop of the stack in some -# uses of asm goto. -ifeq ($(CONFIG_JUMP_LABEL), y) -ADD_ACCUMULATE_OUTGOING_ARGS := y -endif - -cflags-$(ADD_ACCUMULATE_OUTGOING_ARGS) += $(call cc-option,-maccumulate-outgoing-args) - # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. 
ifneq ($(CONFIG_X86_P6_NOP),y) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 9b42b6d1e902..ef5a9cc66fb8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -16,7 +16,7 @@ #ifndef BOOT_BOOT_H #define BOOT_BOOT_H -#define STACK_SIZE 512 /* Minimum number of bytes for stack */ +#define STACK_SIZE 1024 /* Minimum number of bytes for stack */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 801c7a158e55..cbf4b87f55b9 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -9,7 +9,9 @@ #include <linux/efi.h> #include <linux/pci.h> + #include <asm/efi.h> +#include <asm/e820/types.h> #include <asm/setup.h> #include <asm/desc.h> @@ -729,7 +731,7 @@ static void add_e820ext(struct boot_params *params, unsigned long size; e820ext->type = SETUP_E820_EXT; - e820ext->len = nr_entries * sizeof(struct e820entry); + e820ext->len = nr_entries * sizeof(struct boot_e820_entry); e820ext->next = 0; data = (struct setup_data *)(unsigned long)params->hdr.setup_data; @@ -746,9 +748,9 @@ static void add_e820ext(struct boot_params *params, static efi_status_t setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_size) { - struct e820entry *e820_map = ¶ms->e820_map[0]; + struct boot_e820_entry *entry = params->e820_table; struct efi_info *efi = ¶ms->efi_info; - struct e820entry *prev = NULL; + struct boot_e820_entry *prev = NULL; u32 nr_entries; u32 nr_desc; int i; @@ -773,15 +775,15 @@ static efi_status_t setup_e820(struct boot_params *params, case EFI_MEMORY_MAPPED_IO: case EFI_MEMORY_MAPPED_IO_PORT_SPACE: case EFI_PAL_CODE: - e820_type = E820_RESERVED; + e820_type = E820_TYPE_RESERVED; break; case EFI_UNUSABLE_MEMORY: - e820_type = E820_UNUSABLE; + e820_type = E820_TYPE_UNUSABLE; break; case EFI_ACPI_RECLAIM_MEMORY: - e820_type = E820_ACPI; + e820_type = E820_TYPE_ACPI; break; case EFI_LOADER_CODE: @@ -789,15 +791,15 @@ static efi_status_t setup_e820(struct boot_params *params, case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: - e820_type = E820_RAM; + e820_type = E820_TYPE_RAM; break; case EFI_ACPI_MEMORY_NVS: - e820_type = E820_NVS; + e820_type = E820_TYPE_NVS; break; case EFI_PERSISTENT_MEMORY: - e820_type = E820_PMEM; + e820_type = E820_TYPE_PMEM; break; default: @@ -811,26 +813,26 @@ static efi_status_t setup_e820(struct boot_params *params, continue; } - if (nr_entries == ARRAY_SIZE(params->e820_map)) { - u32 need = (nr_desc - i) * sizeof(struct e820entry) + + if (nr_entries == ARRAY_SIZE(params->e820_table)) { + u32 need = (nr_desc - i) * sizeof(struct e820_entry) + sizeof(struct setup_data); if (!e820ext || e820ext_size < need) return EFI_BUFFER_TOO_SMALL; /* boot_params map full, switch to e820 extended */ - e820_map = (struct e820entry *)e820ext->data; + entry = (struct boot_e820_entry *)e820ext->data; } - e820_map->addr = d->phys_addr; - e820_map->size = d->num_pages << PAGE_SHIFT; - e820_map->type = e820_type; - prev = e820_map++; + entry->addr = d->phys_addr; + entry->size = d->num_pages << PAGE_SHIFT; + entry->type = e820_type; + prev = entry++; nr_entries++; } - if (nr_entries > ARRAY_SIZE(params->e820_map)) { - u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_map); + if (nr_entries > ARRAY_SIZE(params->e820_table)) { + u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_table); add_e820ext(params, e820ext, nr_e820ext); nr_entries -= nr_e820ext; @@ -848,7 +850,7 @@ static efi_status_t alloc_e820ext(u32 nr_desc, 
struct setup_data **e820ext, unsigned long size; size = sizeof(struct setup_data) + - sizeof(struct e820entry) * nr_desc; + sizeof(struct e820_entry) * nr_desc; if (*e820ext) { efi_call_early(free_pool, *e820ext); @@ -884,9 +886,9 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, if (first) { nr_desc = *map->buff_size / *map->desc_size; - if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) { + if (nr_desc > ARRAY_SIZE(p->boot_params->e820_table)) { u32 nr_e820ext = nr_desc - - ARRAY_SIZE(p->boot_params->e820_map); + ARRAY_SIZE(p->boot_params->e820_table); status = alloc_e820ext(nr_e820ext, &p->e820ext, &p->e820ext_size); diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c index 6248740b68b5..31922023de49 100644 --- a/arch/x86/boot/compressed/error.c +++ b/arch/x86/boot/compressed/error.c @@ -4,6 +4,7 @@ * memcpy() and memmove() are defined for the compressed boot environment. */ #include "misc.h" +#include "error.h" void warn(char *m) { diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8b7c9e75edcb..54c24f0a43d3 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -426,7 +426,7 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct e820entry *entry, +static void process_e820_entry(struct boot_e820_entry *entry, unsigned long minimum, unsigned long image_size) { @@ -435,7 +435,7 @@ static void process_e820_entry(struct e820entry *entry, unsigned long start_orig; /* Skip non-RAM entries. */ - if (entry->type != E820_RAM) + if (entry->type != E820_TYPE_RAM) return; /* On 32-bit, ignore entries entirely above our maximum. */ @@ -518,7 +518,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum, /* Verify potential e820 positions, appending to slots list. */ for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_map[i], minimum, + process_e820_entry(&boot_params->e820_table[i], minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); @@ -597,10 +597,17 @@ void choose_random_location(unsigned long input, add_identity_map(random_addr, output_size); *output = random_addr; } + + /* + * This loads the identity mapping page table. + * This should only be done if a new physical address + * is found for the kernel, otherwise we should keep + * the old page table to make it be like the "nokaslr" + * case. + */ + finalize_identity_maps(); } - /* This actually loads the identity pagetable on x86_64. */ - finalize_identity_maps(); /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. 
*/ if (IS_ENABLED(CONFIG_X86_64)) diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 4ad7d70e8739..8f0c4c9fc904 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] = 0, /* REQUIRED_MASK5 not implemented in this file */ REQUIRED_MASK6, 0, /* REQUIRED_MASK7 not implemented in this file */ + 0, /* REQUIRED_MASK8 not implemented in this file */ + 0, /* REQUIRED_MASK9 not implemented in this file */ + 0, /* REQUIRED_MASK10 not implemented in this file */ + 0, /* REQUIRED_MASK11 not implemented in this file */ + 0, /* REQUIRED_MASK12 not implemented in this file */ + 0, /* REQUIRED_MASK13 not implemented in this file */ + 0, /* REQUIRED_MASK14 not implemented in this file */ + 0, /* REQUIRED_MASK15 not implemented in this file */ + REQUIRED_MASK16, }; #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 6687ab953257..9e77c23c2422 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -70,16 +70,19 @@ int has_eflag(unsigned long mask) # define EBX_REG "=b" #endif -static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d) +static inline void cpuid_count(u32 id, u32 count, + u32 *a, u32 *b, u32 *c, u32 *d) { asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" "cpuid \n\t" ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t" : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b) - : "a" (id) + : "a" (id), "c" (count) ); } +#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d) + void get_cpuflags(void) { u32 max_intel_level, max_amd_level; @@ -108,6 +111,11 @@ void get_cpuflags(void) cpu.model += ((tfms >> 16) & 0xf) << 4; } + if (max_intel_level >= 0x00000007) { + cpuid_count(0x00000007, 0, &ignored, &ignored, + &cpu.flags[16], &ignored); + } + cpuid(0x80000000, &max_amd_level, &ignored, &ignored, &ignored); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 3dd5be33aaa7..2ed8f0c25def 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -18,7 +18,6 @@ #include <asm/segment.h> #include <generated/utsrelease.h> #include <asm/boot.h> -#include <asm/e820.h> #include <asm/page_types.h> #include <asm/setup.h> #include <asm/bootparam.h> diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index db75d07c3645..d9c28c87e477 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -21,8 +21,8 @@ static int detect_memory_e820(void) { int count = 0; struct biosregs ireg, oreg; - struct e820entry *desc = boot_params.e820_map; - static struct e820entry buf; /* static so it is zeroed */ + struct boot_e820_entry *desc = boot_params.e820_table; + static struct boot_e820_entry buf; /* static so it is zeroed */ initregs(&ireg); ireg.ax = 0xe820; @@ -66,7 +66,7 @@ static int detect_memory_e820(void) *desc++ = buf; count++; - } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map)); + } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_table)); return boot_params.e820_entries = count; } diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 5fa6ee2c2dde..6cf79e1a6830 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -57,6 +57,8 @@ CONFIG_EFI=y CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_RANDOMIZE_MEMORY=y # CONFIG_COMPAT_VDSO is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig 
index 7ef4a099defc..de45f57b410d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -55,6 +55,8 @@ CONFIG_EFI=y CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_RANDOMIZE_MEMORY=y # CONFIG_COMPAT_VDSO is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y @@ -176,6 +178,7 @@ CONFIG_E1000E=y CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y +CONFIG_R8169=y CONFIG_FDDI=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index a916c4a61165..5f6a5af9c489 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -65,7 +65,6 @@ #include <linux/linkage.h> #include <asm/inst.h> -#define CONCAT(a,b) a##b #define VMOVDQ vmovdqu #define xdata0 %xmm0 @@ -92,8 +91,6 @@ #define num_bytes %r8 #define tmp %r10 -#define DDQ(i) CONCAT(ddq_add_,i) -#define XMM(i) CONCAT(%xmm, i) #define DDQ_DATA 0 #define XDATA 1 #define KEY_128 1 @@ -131,12 +128,12 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ .macro setddq n - var_ddq_add = DDQ(\n) + var_ddq_add = ddq_add_\n .endm /* generate a unique variable for xmm register */ .macro setxdata n - var_xdata = XMM(\n) + var_xdata = %xmm\n .endm /* club the numeric 'id' to the symbol 'name' */ diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index aa76cad9d262..af4840ab2a3d 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1522,7 +1522,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[2 * 4]; + le128 buf[2 * 4]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), @@ -1540,7 +1540,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[2 * 4]; + le128 buf[2 * 4]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 260a060d7275..24ac9fad832d 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <crypto/b128ops.h> +#include <crypto/gf128mul.h> #include <crypto/internal/skcipher.h> #include <crypto/lrw.h> #include <crypto/xts.h> @@ -457,7 +458,7 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, le128 ivblk = *iv; /* generate next IV */ - le128_gf128mul_x_ble(iv, &ivblk); + gf128mul_x_ble(iv, &ivblk); /* CC <- T xor C */ u128_xor(dst, src, (u128 *)&ivblk); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 644f97ab8cac..ac0e831943f5 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -328,7 +328,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; + le128 buf[SERPENT_PARALLEL_BLOCKS]; struct crypt_priv crypt_ctx = { .ctx = &ctx->crypt_ctx, .fpu_enabled = false, @@ -355,7 +355,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { 
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; + le128 buf[SERPENT_PARALLEL_BLOCKS]; struct crypt_priv crypt_ctx = { .ctx = &ctx->crypt_ctx, .fpu_enabled = false, diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 2ebb5e9789f3..243e90a4b5d9 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -296,7 +296,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[3]; + le128 buf[3]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), @@ -314,7 +314,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[3]; + le128 buf[3]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 370c42c7f046..cdefcfdd9e63 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -22,6 +22,7 @@ #include <linux/context_tracking.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> +#include <linux/livepatch.h> #include <asm/desc.h> #include <asm/traps.h> @@ -130,14 +131,13 @@ static long syscall_trace_enter(struct pt_regs *regs) #define EXIT_TO_USERMODE_LOOP_FLAGS \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY) + _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) { /* * In order to return to user mode, we need to have IRQs off with - * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, - * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags + * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags * can be set at any time on preemptable kernels if we have IRQs on, * so we need to loop. Disabling preemption wouldn't help: doing the * work to clear some of the flags can sleep. 
@@ -164,6 +164,9 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) if (cached_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); + if (cached_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + /* Disable IRQs and retry */ local_irq_disable(); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 57f7ec35216e..50bc26949e9e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -35,16 +35,13 @@ #include <asm/errno.h> #include <asm/segment.h> #include <asm/smp.h> -#include <asm/page_types.h> #include <asm/percpu.h> #include <asm/processor-flags.h> -#include <asm/ftrace.h> #include <asm/irq_vectors.h> #include <asm/cpufeatures.h> #include <asm/alternative-asm.h> #include <asm/asm.h> #include <asm/smap.h> -#include <asm/export.h> #include <asm/frame.h> .section .entry.text, "ax" @@ -585,7 +582,7 @@ ENTRY(iret_exc ) * will soon execute iret and the tracer was already set to * the irqstate after the IRET: */ - DISABLE_INTERRUPTS(CLBR_EAX) + DISABLE_INTERRUPTS(CLBR_ANY) lss (%esp), %esp /* switch to espfix segment */ jmp .Lrestore_nocheck #endif @@ -886,172 +883,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4, %esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax -.Lftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -/* This is weak to keep gas from relaxing the jumps */ -WEAK(ftrace_stub) - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location. 
- */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS, 13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp .Lftrace_ret - - popf - jmp ftrace_stub -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ - - cmpl $ftrace_stub, ftrace_trace_function - jnz .Ltrace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -.Ltrace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -EXPORT_SYMBOL(mcount) -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - #ifdef CONFIG_TRACING ENTRY(trace_page_fault) ASM_CLAC diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 044d18ebc43c..607d72c4a485 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -212,7 +212,7 @@ entry_SYSCALL_64_fastpath: * If we see that no exit work is required (which we are required * to check with IRQs off), then we can go straight to SYSRET64. */ - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movq PER_CPU_VAR(current_task), %r11 testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) @@ -233,7 +233,7 @@ entry_SYSCALL_64_fastpath: * raise(3) will trigger this, for example. IRQs are off. 
*/ TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) + ENABLE_INTERRUPTS(CLBR_ANY) SAVE_EXTRA_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ @@ -265,12 +265,9 @@ return_from_SYSCALL_64: * * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. + * + * Change top 16 bits to be the sign-extension of 47th bit */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- SYSRET checks need update" - .endif - - /* Change top 16 bits to be the sign-extension of 47th bit */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx @@ -343,7 +340,7 @@ ENTRY(stub_ptregs_64) * Called from fast path -- disable IRQs again, pop return address * and jump to slow path */ - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax jmp entry_SYSCALL64_slow_path @@ -518,7 +515,7 @@ common_interrupt: interrupt do_IRQ /* 0(%rsp): old RSP */ ret_from_intr: - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) @@ -1051,7 +1048,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs @@ -1156,10 +1153,9 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) - movl %ebx, %eax - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - testl %eax, %eax + testl %ebx, %ebx jnz retint_kernel jmp retint_user END(error_exit) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9ba050fe47f3..448ac2161112 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -226,7 +226,7 @@ 217 i386 pivot_root sys_pivot_root 218 i386 mincore sys_mincore 219 i386 madvise sys_madvise -220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +220 i386 getdents64 sys_getdents64 221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 # 222 is unused # 223 is unused @@ -390,3 +390,4 @@ 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx +384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 9d4d6e138311..fa8dbfcf7ed3 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -17,6 +17,7 @@ #include <asm/unistd.h> #include <asm/msr.h> #include <asm/pvclock.h> +#include <asm/mshyperv.h> #include <linux/math64.h> #include <linux/time.h> #include <linux/kernel.h> @@ -32,6 +33,11 @@ extern u8 pvclock_page __attribute__((visibility("hidden"))); #endif +#ifdef CONFIG_HYPERV_TSCPAGE +extern u8 hvclock_page + __attribute__((visibility("hidden"))); +#endif + #ifndef BUILD_VDSO32 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) @@ -141,6 +147,20 @@ static notrace u64 vread_pvclock(int *mode) return last; } #endif +#ifdef CONFIG_HYPERV_TSCPAGE +static notrace u64 vread_hvclock(int *mode) +{ + const struct ms_hyperv_tsc_page *tsc_pg = + (const struct ms_hyperv_tsc_page *)&hvclock_page; + u64 current_tick = hv_read_tsc_page(tsc_pg); + + if (current_tick != U64_MAX) + return current_tick; + + *mode = VCLOCK_NONE; + return 0; +} +#endif notrace 
static u64 vread_tsc(void) { @@ -173,6 +193,10 @@ notrace static inline u64 vgetsns(int *mode) else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); #endif +#ifdef CONFIG_HYPERV_TSCPAGE + else if (gtod->vclock_mode == VCLOCK_HVCLOCK) + cycles = vread_hvclock(mode); +#endif else return 0; v = (cycles - gtod->cycle_last) & gtod->mask; diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index a708aa90b507..8ebb4b6454fe 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 2 * PAGE_SIZE; + vvar_start = . - 3 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -36,6 +36,7 @@ SECTIONS #undef EMIT_VVAR pvclock_page = vvar_start + PAGE_SIZE; + hvclock_page = vvar_start + 2 * PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 491020b2826d..0780a443a53b 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -74,6 +74,7 @@ enum { sym_vvar_page, sym_hpet_page, sym_pvclock_page, + sym_hvclock_page, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -82,6 +83,7 @@ const int special_pages[] = { sym_vvar_page, sym_hpet_page, sym_pvclock_page, + sym_hvclock_page, }; struct vdso_sym { @@ -94,6 +96,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, [sym_pvclock_page] = {"pvclock_page", true}, + [sym_hvclock_page] = {"hvclock_page", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 7853b53959cd..3f9d1a83891a 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -30,8 +30,10 @@ static int __init vdso32_setup(char *s) { vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso32_enabled > 1) + if (vdso32_enabled > 1) { pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + vdso32_enabled = 0; + } return 1; } @@ -62,13 +64,18 @@ subsys_initcall(sysenter_setup); /* Register vsyscall32 into the ABI table */ #include <linux/sysctl.h> +static const int zero; +static const int one = 1; + static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&zero, + .extra2 = (int *)&one, }, {} }; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 226ca70dc6bd..139ad7726e10 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -22,6 +22,7 @@ #include <asm/page.h> #include <asm/desc.h> #include <asm/cpufeature.h> +#include <asm/mshyperv.h> #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; @@ -121,6 +122,12 @@ static int vvar_fault(const struct vm_special_mapping *sm, vmf->address, __pa(pvti) >> PAGE_SHIFT); } + } else if (sym_offset == image->sym_hvclock_page) { + struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); + + if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) + ret = vm_insert_pfn(vma, vmf->address, + vmalloc_to_pfn(tsc_pg)); } if (ret == 0 || ret == -EBUSY) @@ -354,7 +361,7 @@ static void vgetcpu_cpu_init(void *arg) d.p = 1; /* Present */ d.d = 1; /* 32-bit */ - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, 
DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } static int vgetcpu_online(unsigned int cpu) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index afb222b63cae..c84584bb9402 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -604,7 +604,7 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, return &amd_f15_PMC20; } case AMD_EVENT_NB: - /* moved to perf_event_amd_uncore.c */ + /* moved to uncore.c */ return &emptyconstraint; default: return &emptyconstraint; diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index b28200dea715..3641e24fdac5 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -11,6 +11,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) "perf/amd_iommu: " fmt + #include <linux/perf_event.h> #include <linux/init.h> #include <linux/cpumask.h> @@ -21,44 +23,42 @@ #define COUNTER_SHIFT 16 -#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) -#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) +/* iommu pmu conf masks */ +#define GET_CSOURCE(x) ((x)->conf & 0xFFULL) +#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL) +#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL) +#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL) -/* iommu pmu config masks */ -#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) -#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) -#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) -#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) -#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) -#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) -#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) +/* iommu pmu conf1 masks */ +#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL) +#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) +#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) -static struct perf_amd_iommu __perf_iommu; +#define IOMMU_NAME_SIZE 16 struct perf_amd_iommu { + struct list_head list; struct pmu pmu; + struct amd_iommu *iommu; + char name[IOMMU_NAME_SIZE]; u8 max_banks; u8 max_counters; u64 cntr_assign_mask; raw_spinlock_t lock; - const struct attribute_group *attr_groups[4]; }; -#define format_group attr_groups[0] -#define cpumask_group attr_groups[1] -#define events_group attr_groups[2] -#define null_group attr_groups[3] +static LIST_HEAD(perf_amd_iommu_list); /*--------------------------------------------- * sysfs format attributes *---------------------------------------------*/ PMU_FORMAT_ATTR(csource, "config:0-7"); PMU_FORMAT_ATTR(devid, "config:8-23"); -PMU_FORMAT_ATTR(pasid, "config:24-39"); -PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(domid, "config:24-39"); +PMU_FORMAT_ATTR(pasid, "config:40-59"); PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); -PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); -PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); +PMU_FORMAT_ATTR(domid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(pasid_mask, "config1:32-51"); static struct attribute *iommu_format_attrs[] = { &format_attr_csource.attr, @@ -79,6 +79,10 @@ static struct attribute_group amd_iommu_format_group = { /*--------------------------------------------- * sysfs events attributes *---------------------------------------------*/ +static struct attribute_group amd_iommu_events_group = { + .name = "events", +}; + struct amd_iommu_event_desc { struct kobj_attribute attr; const char *event; 
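(For reference, the new conf/conf1 encoding behind the GET_* macros above packs csource into bits 0-7, devid into 8-23, domid into 24-39 and pasid into 40-59, matching the updated sysfs format strings. The following stand-alone sketch is not kernel code; pack_conf() is a hypothetical helper, and the extraction uses the same shifts and masks as the patch.)

#include <stdio.h>
#include <stdint.h>

/* Illustrative packing of an AMD IOMMU PMU event config in the new layout:
 * csource: bits 0-7, devid: 8-23, domid: 24-39, pasid: 40-59. */
static uint64_t pack_conf(uint8_t csource, uint16_t devid,
			  uint16_t domid, uint32_t pasid)
{
	return (uint64_t)csource |
	       ((uint64_t)devid << 8) |
	       ((uint64_t)domid << 24) |
	       (((uint64_t)pasid & 0xFFFFFULL) << 40);
}

int main(void)
{
	uint64_t conf = pack_conf(0x02, 0x1234, 0x0007, 0xABCDE);

	/* Same extraction as the GET_CSOURCE/GET_DEVID/GET_DOMID/GET_PASID
	 * masks in the patch. */
	printf("csource=%#llx devid=%#llx domid=%#llx pasid=%#llx\n",
	       (unsigned long long)(conf & 0xFFULL),
	       (unsigned long long)((conf >> 8) & 0xFFFFULL),
	       (unsigned long long)((conf >> 24) & 0xFFFFULL),
	       (unsigned long long)((conf >> 40) & 0xFFFFFULL));
	return 0;
}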
@@ -150,30 +154,34 @@ static struct attribute_group amd_iommu_cpumask_group = { /*---------------------------------------------*/ -static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +static int get_next_avail_iommu_bnk_cntr(struct perf_event *event) { + struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu); + int max_cntrs = piommu->max_counters; + int max_banks = piommu->max_banks; + u32 shift, bank, cntr; unsigned long flags; - int shift, bank, cntr, retval; - int max_banks = perf_iommu->max_banks; - int max_cntrs = perf_iommu->max_counters; + int retval; - raw_spin_lock_irqsave(&perf_iommu->lock, flags); + raw_spin_lock_irqsave(&piommu->lock, flags); for (bank = 0, shift = 0; bank < max_banks; bank++) { for (cntr = 0; cntr < max_cntrs; cntr++) { shift = bank + (bank*3) + cntr; - if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) { + if (piommu->cntr_assign_mask & BIT_ULL(shift)) { continue; } else { - perf_iommu->cntr_assign_mask |= (1ULL<<shift); - retval = ((u16)((u16)bank<<8) | (u8)(cntr)); + piommu->cntr_assign_mask |= BIT_ULL(shift); + event->hw.iommu_bank = bank; + event->hw.iommu_cntr = cntr; + retval = 0; goto out; } } } retval = -ENOSPC; out: - raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + raw_spin_unlock_irqrestore(&piommu->lock, flags); return retval; } @@ -202,8 +210,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, static int perf_iommu_event_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct perf_amd_iommu *perf_iommu; - u64 config, config1; /* test the event attr type check for PMU enumeration */ if (event->attr.type != event->pmu->type) @@ -225,80 +231,62 @@ static int perf_iommu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - perf_iommu = &__perf_iommu; - - if (event->pmu != &perf_iommu->pmu) - return -ENOENT; - - if (perf_iommu) { - config = event->attr.config; - config1 = event->attr.config1; - } else { - return -EINVAL; - } - - /* integrate with iommu base devid (0000), assume one iommu */ - perf_iommu->max_banks = - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); - perf_iommu->max_counters = - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); - if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) - return -EINVAL; - /* update the hw_perf_event struct with the iommu config data */ - hwc->config = config; - hwc->extra_reg.config = config1; + hwc->conf = event->attr.config; + hwc->conf1 = event->attr.config1; return 0; } +static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev) +{ + return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu; +} + static void perf_iommu_enable_event(struct perf_event *ev) { - u8 csource = _GET_CSOURCE(ev); - u16 devid = _GET_DEVID(ev); + struct amd_iommu *iommu = perf_event_2_iommu(ev); + struct hw_perf_event *hwc = &ev->hw; + u8 bank = hwc->iommu_bank; + u8 cntr = hwc->iommu_cntr; u64 reg = 0ULL; - reg = csource; - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_COUNTER_SRC_REG, ®, true); + reg = GET_CSOURCE(hwc); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, ®); - reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); + reg = GET_DEVID_MASK(hwc); + reg = GET_DEVID(hwc) | (reg << 32); if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DEVID_MATCH_REG, ®, true); + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, 
IOMMU_PC_DEVID_MATCH_REG, ®); - reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); + reg = GET_PASID_MASK(hwc); + reg = GET_PASID(hwc) | (reg << 32); if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_PASID_MATCH_REG, ®, true); + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, ®); - reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); + reg = GET_DOMID_MASK(hwc); + reg = GET_DOMID(hwc) | (reg << 32); if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DOMID_MATCH_REG, ®, true); + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, ®); } static void perf_iommu_disable_event(struct perf_event *event) { + struct amd_iommu *iommu = perf_event_2_iommu(event); + struct hw_perf_event *hwc = &event->hw; u64 reg = 0ULL; - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_SRC_REG, ®, true); + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_SRC_REG, ®); } static void perf_iommu_start(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - pr_debug("perf: amd_iommu:perf_iommu_start\n"); if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -306,10 +294,11 @@ static void perf_iommu_start(struct perf_event *event, int flags) hwc->state = 0; if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &prev_raw_count, true); + u64 prev_raw_count = local64_read(&hwc->prev_count); + struct amd_iommu *iommu = perf_event_2_iommu(event); + + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_REG, &prev_raw_count); } perf_iommu_enable_event(event); @@ -319,37 +308,30 @@ static void perf_iommu_start(struct perf_event *event, int flags) static void perf_iommu_read(struct perf_event *event) { - u64 count = 0ULL; - u64 prev_raw_count = 0ULL; - u64 delta = 0ULL; + u64 count, prev, delta; struct hw_perf_event *hwc = &event->hw; - pr_debug("perf: amd_iommu:perf_iommu_read\n"); + struct amd_iommu *iommu = perf_event_2_iommu(event); - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &count, false); + if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_REG, &count)) + return; /* IOMMU pc counter register is only 48 bits */ - count &= 0xFFFFFFFFFFFFULL; + count &= GENMASK_ULL(47, 0); - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - count) != prev_raw_count) + prev = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev) return; - /* Handling 48-bit counter overflowing */ - delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); + /* Handle 48-bit counter overflow */ + delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); delta >>= COUNTER_SHIFT; local64_add(delta, &event->count); - } static void perf_iommu_stop(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - u64 config; - - pr_debug("perf: amd_iommu:perf_iommu_stop\n"); if (hwc->state & PERF_HES_UPTODATE) return; @@ -361,7 +343,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; - 
config = hwc->config; perf_iommu_read(event); hwc->state |= PERF_HES_UPTODATE; } @@ -369,17 +350,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags) static int perf_iommu_add(struct perf_event *event, int flags) { int retval; - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - pr_debug("perf: amd_iommu:perf_iommu_add\n"); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; /* request an iommu bank/counter */ - retval = get_next_avail_iommu_bnk_cntr(perf_iommu); - if (retval != -ENOSPC) - event->hw.extra_reg.reg = (u16)retval; - else + retval = get_next_avail_iommu_bnk_cntr(event); + if (retval) return retval; if (flags & PERF_EF_START) @@ -390,115 +366,124 @@ static int perf_iommu_add(struct perf_event *event, int flags) static void perf_iommu_del(struct perf_event *event, int flags) { + struct hw_perf_event *hwc = &event->hw; struct perf_amd_iommu *perf_iommu = container_of(event->pmu, struct perf_amd_iommu, pmu); - pr_debug("perf: amd_iommu:perf_iommu_del\n"); perf_iommu_stop(event, PERF_EF_UPDATE); /* clear the assigned iommu bank/counter */ clear_avail_iommu_bnk_cntr(perf_iommu, - _GET_BANK(event), - _GET_CNTR(event)); + hwc->iommu_bank, hwc->iommu_cntr); perf_event_update_userpage(event); } -static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) +static __init int _init_events_attrs(void) { - struct attribute **attrs; - struct attribute_group *attr_group; int i = 0, j; + struct attribute **attrs; while (amd_iommu_v2_event_descs[i].attr.attr.name) i++; - attr_group = kzalloc(sizeof(struct attribute *) - * (i + 1) + sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) + attrs = kzalloc(sizeof(struct attribute **) * (i + 1), GFP_KERNEL); + if (!attrs) return -ENOMEM; - attrs = (struct attribute **)(attr_group + 1); for (j = 0; j < i; j++) attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; - attr_group->name = "events"; - attr_group->attrs = attrs; - perf_iommu->events_group = attr_group; - + amd_iommu_events_group.attrs = attrs; return 0; } -static __init void amd_iommu_pc_exit(void) -{ - if (__perf_iommu.events_group != NULL) { - kfree(__perf_iommu.events_group); - __perf_iommu.events_group = NULL; - } -} +const struct attribute_group *amd_iommu_attr_groups[] = { + &amd_iommu_format_group, + &amd_iommu_cpumask_group, + &amd_iommu_events_group, + NULL, +}; + +static struct pmu iommu_pmu = { + .event_init = perf_iommu_event_init, + .add = perf_iommu_add, + .del = perf_iommu_del, + .start = perf_iommu_start, + .stop = perf_iommu_stop, + .read = perf_iommu_read, + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_iommu_attr_groups, +}; -static __init int _init_perf_amd_iommu( - struct perf_amd_iommu *perf_iommu, char *name) +static __init int init_one_iommu(unsigned int idx) { + struct perf_amd_iommu *perf_iommu; int ret; + perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL); + if (!perf_iommu) + return -ENOMEM; + raw_spin_lock_init(&perf_iommu->lock); - /* Init format attributes */ - perf_iommu->format_group = &amd_iommu_format_group; + perf_iommu->pmu = iommu_pmu; + perf_iommu->iommu = get_amd_iommu(idx); + perf_iommu->max_banks = amd_iommu_pc_get_max_banks(idx); + perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx); - /* Init cpumask attributes to only core 0 */ - cpumask_set_cpu(0, &iommu_cpumask); - perf_iommu->cpumask_group = &amd_iommu_cpumask_group; - - /* Init events attributes */ - if (_init_events_attrs(perf_iommu) != 0) - pr_err("perf: amd_iommu: Only support raw 
events.\n"); + if (!perf_iommu->iommu || + !perf_iommu->max_banks || + !perf_iommu->max_counters) { + kfree(perf_iommu); + return -EINVAL; + } - /* Init null attributes */ - perf_iommu->null_group = NULL; - perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; + snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx); - ret = perf_pmu_register(&perf_iommu->pmu, name, -1); - if (ret) { - pr_err("perf: amd_iommu: Failed to initialized.\n"); - amd_iommu_pc_exit(); + ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1); + if (!ret) { + pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n", + idx, perf_iommu->max_banks, perf_iommu->max_counters); + list_add_tail(&perf_iommu->list, &perf_amd_iommu_list); } else { - pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n", - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); + pr_warn("Error initializing IOMMU %d.\n", idx); + kfree(perf_iommu); } - return ret; } -static struct perf_amd_iommu __perf_iommu = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - .event_init = perf_iommu_event_init, - .add = perf_iommu_add, - .del = perf_iommu_del, - .start = perf_iommu_start, - .stop = perf_iommu_stop, - .read = perf_iommu_read, - }, - .max_banks = 0x00, - .max_counters = 0x00, - .cntr_assign_mask = 0ULL, - .format_group = NULL, - .cpumask_group = NULL, - .events_group = NULL, - .null_group = NULL, -}; - static __init int amd_iommu_pc_init(void) { + unsigned int i, cnt = 0; + int ret; + /* Make sure the IOMMU PC resource is available */ if (!amd_iommu_pc_supported()) return -ENODEV; - _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); + ret = _init_events_attrs(); + if (ret) + return ret; + + /* + * An IOMMU PMU is specific to an IOMMU, and can function independently. + * So we go through all IOMMUs and ignore the one that fails init + * unless all IOMMU are failing. 
+ */ + for (i = 0; i < amd_iommu_get_num_iommus(); i++) { + ret = init_one_iommu(i); + if (!ret) + cnt++; + } + + if (!cnt) { + kfree(amd_iommu_events_group.attrs); + return -ENODEV; + } + /* Init cpumask attributes to only core 0 */ + cpumask_set_cpu(0, &iommu_cpumask); return 0; } diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index 845d173278e3..62e0702c4374 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -24,17 +24,23 @@ #define PC_MAX_SPEC_BNKS 64 #define PC_MAX_SPEC_CNTRS 16 -/* iommu pc reg masks*/ -#define IOMMU_BASE_DEVID 0x0000 +struct amd_iommu; /* amd_iommu_init.c external support functions */ +extern int amd_iommu_get_num_iommus(void); + extern bool amd_iommu_pc_supported(void); -extern u8 amd_iommu_pc_get_max_banks(u16 devid); +extern u8 amd_iommu_pc_get_max_banks(unsigned int idx); + +extern u8 amd_iommu_pc_get_max_counters(unsigned int idx); + +extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, + u8 fxn, u64 *value); -extern u8 amd_iommu_pc_get_max_counters(u16 devid); +extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, + u8 fxn, u64 *value); -extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, - u8 fxn, u64 *value, bool is_write); +extern struct amd_iommu *get_amd_iommu(int idx); #endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 4d1f7f2d9aff..ad44af0dd667 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -30,6 +30,9 @@ #define COUNTER_SHIFT 16 +#undef pr_fmt +#define pr_fmt(fmt) "amd_uncore: " fmt + static int num_counters_llc; static int num_counters_nb; @@ -509,51 +512,34 @@ static int __init amd_uncore_init(void) int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto fail_nodev; - - switch(boot_cpu_data.x86) { - case 23: - /* Family 17h: */ - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L3; - /* - * For Family17h, the NorthBridge counters are - * re-purposed as Data Fabric counters. Also, support is - * added for L3 counters. The pmus are exported based on - * family as either L2 or L3 and NB or DF. - */ - amd_nb_pmu.name = "amd_df"; - amd_llc_pmu.name = "amd_l3"; - format_attr_event_df.show = &event_show_df; - format_attr_event_l3.show = &event_show_l3; - break; - case 22: - /* Family 16h - may change: */ - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - amd_nb_pmu.name = "amd_nb"; - amd_llc_pmu.name = "amd_l2"; - format_attr_event_df = format_attr_event; - format_attr_event_l3 = format_attr_event; - break; - default: - /* - * All prior families have the same number of - * NorthBridge and Last Level Cache counters - */ - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - amd_nb_pmu.name = "amd_nb"; - amd_llc_pmu.name = "amd_l2"; - format_attr_event_df = format_attr_event; - format_attr_event_l3 = format_attr_event; - break; - } - amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; - amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; + return -ENODEV; if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) - goto fail_nodev; + return -ENODEV; + + if (boot_cpu_data.x86 == 0x17) { + /* + * For F17h, the Northbridge counters are repurposed as Data + * Fabric counters. Also, L3 counters are supported too. The PMUs + * are exported based on family as either L2 or L3 and NB or DF. 
+ */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L3; + amd_nb_pmu.name = "amd_df"; + amd_llc_pmu.name = "amd_l3"; + format_attr_event_df.show = &event_show_df; + format_attr_event_l3.show = &event_show_l3; + } else { + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + } + + amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; + amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { amd_uncore_nb = alloc_percpu(struct amd_uncore *); @@ -565,7 +551,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("perf: AMD NB counters detected\n"); + pr_info("AMD NB counters detected\n"); ret = 0; } @@ -579,7 +565,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("perf: AMD LLC counters detected\n"); + pr_info("AMD LLC counters detected\n"); ret = 0; } @@ -615,7 +601,6 @@ fail_nb: if (amd_uncore_nb) free_percpu(amd_uncore_nb); -fail_nodev: return ret; } device_initcall(amd_uncore_init); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 349d4d17aa7f..580b60f5ac83 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2101,8 +2101,8 @@ static int x86_pmu_event_init(struct perf_event *event) static void refresh_pce(void *ignored) { - if (current->mm) - load_mm_cr4(current->mm); + if (current->active_mm) + load_mm_cr4(current->active_mm); } static void x86_pmu_event_mapped(struct perf_event *event) @@ -2110,6 +2110,18 @@ static void x86_pmu_event_mapped(struct perf_event *event) if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; + /* + * This function relies on not being called concurrently in two + * tasks in the same mm. Otherwise one task could observe + * perf_rdpmc_allowed > 1 and return all the way back to + * userspace with CR4.PCE clear while another task is still + * doing on_each_cpu_mask() to propagate CR4.PCE. + * + * For now, this can't happen because all callers hold mmap_sem + * for write. If this changes, we'll need a different solution. + */ + lockdep_assert_held_exclusive(&current->mm->mmap_sem); + if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); } @@ -2244,6 +2256,7 @@ void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data *data; + u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; @@ -2251,11 +2264,13 @@ void arch_perf_update_userpage(struct perf_event *event, !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable()) + if (!using_native_sched_clock() || !sched_clock_stable()) return; data = cyc2ns_read_begin(); + offset = data->cyc2ns_offset + __sched_clock_offset; + /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain.
@@ -2263,7 +2278,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; - userpg->time_offset = data->cyc2ns_offset - now; + userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different @@ -2271,7 +2286,7 @@ void arch_perf_update_userpage(struct perf_event *event, */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + userpg->time_zero = offset; } cyc2ns_read_end(data); diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 982c9e31daca..8ae8c5ce3a1f 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -63,7 +63,6 @@ struct bts_buffer { unsigned int cur_buf; bool snapshot; local_t data_size; - local_t lost; local_t head; unsigned long end; void **data_pages; @@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts) return; if (ds->bts_index >= ds->bts_absolute_maximum) - local_inc(&buf->lost); + perf_aux_output_flag(&bts->handle, + PERF_AUX_FLAG_TRUNCATED); /* * old and head are always in the same physical buffer, so we @@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags) return; fail_end_stop: - perf_aux_output_end(&bts->handle, 0, false); + perf_aux_output_end(&bts->handle, 0); fail_stop: event->hw.state = PERF_HES_STOPPED; @@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags) bts->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); - - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); + perf_aux_output_end(&bts->handle, + local_xchg(&buf->data_size, 0)); } cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; @@ -484,8 +483,7 @@ int intel_bts_interrupt(void) if (old_head == local_read(&buf->head)) return handled; - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0)); buf = perf_aux_output_begin(&bts->handle, event); if (buf) @@ -500,7 +498,7 @@ int intel_bts_interrupt(void) * cleared handle::event */ barrier(); - perf_aux_output_end(&bts->handle, 0, false); + perf_aux_output_end(&bts->handle, 0); } } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index eb1484c86bb4..a6d91d4e37a1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1553,6 +1553,27 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c"); +EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3"); +/* UOPS_NOT_DELIVERED.ANY */ +EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c"); +/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */ +EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02"); +/* UOPS_RETIRED.ANY */ +EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2"); +/* UOPS_ISSUED.ANY */ +EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e"); + +static struct attribute *glm_events_attrs[] = { + EVENT_PTR(td_total_slots_glm), + EVENT_PTR(td_total_slots_scale_glm), + EVENT_PTR(td_fetch_bubbles_glm), + EVENT_PTR(td_recovery_bubbles_glm), + EVENT_PTR(td_slots_issued_glm), + EVENT_PTR(td_slots_retired_glm), + NULL +}; + static struct extra_reg intel_glm_extra_regs[] __read_mostly = { /* must define 
OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0), @@ -2130,7 +2151,7 @@ again: * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - status &= ~cpuc->pebs_enabled; + status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); /* * PEBS overflow sets bit 62 in the global status register @@ -3750,6 +3771,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.cpu_events = glm_events_attrs; pr_cont("Goldmont events, "); break; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index aff4b5b69d40..238ae3248ba5 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -1,5 +1,5 @@ /* - * perf_event_intel_cstate.c: support cstate residency counters + * Support cstate residency counters * * Copyright (C) 2015, Intel Corp. * Author: Kan Liang (kan.liang@intel.com) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9dfeeeca0ea8..c6d23ffe422d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1222,7 +1222,7 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) /* clear non-PEBS bit and re-check */ pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + pebs_status &= PEBS_COUNTER_MASK; if (pebs_status == (1 << bit)) return at; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 81b321ace8e0..f924629836a8 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -507,6 +507,9 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].to = msr_lastbranch.to; cpuc->lbr_entries[i].mispred = 0; cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].in_tx = 0; + cpuc->lbr_entries[i].abort = 0; + cpuc->lbr_entries[i].cycles = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 5900471ee508..ae8324d65e61 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -28,6 +28,7 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/intel_pt.h> +#include <asm/intel-family.h> #include "../perf_event.h" #include "pt.h" @@ -98,6 +99,7 @@ static struct attribute_group pt_cap_group = { .name = "caps", }; +PMU_FORMAT_ATTR(pt, "config:0" ); PMU_FORMAT_ATTR(cyc, "config:1" ); PMU_FORMAT_ATTR(pwr_evt, "config:4" ); PMU_FORMAT_ATTR(fup_on_ptw, "config:5" ); @@ -105,11 +107,13 @@ PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); PMU_FORMAT_ATTR(ptw, "config:12" ); +PMU_FORMAT_ATTR(branch, "config:13" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { + &format_attr_pt.attr, &format_attr_cyc.attr, &format_attr_pwr_evt.attr, &format_attr_fup_on_ptw.attr, @@ -117,6 +121,7 @@ static struct attribute *pt_formats_attr[] = { &format_attr_tsc.attr, &format_attr_noretcomp.attr, &format_attr_ptw.attr, + &format_attr_branch.attr, &format_attr_mtc_period.attr, &format_attr_cyc_thresh.attr, &format_attr_psb_period.attr, @@ -197,6 +202,19 @@ static int __init pt_pmu_hw_init(void) pt_pmu.tsc_art_den = eax; } + /* model-specific quirks */ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_CORE: + case 
INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL_X: + /* not setting BRANCH_EN will #GP, erratum BDM106 */ + pt_pmu.branch_en_always_on = true; + break; + default: + break; + } + if (boot_cpu_has(X86_FEATURE_VMX)) { /* * Intel SDM, 36.5 "Tracing post-VMXON" says that @@ -263,8 +281,20 @@ fail: #define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \ RTIT_CTL_FUP_ON_PTW) -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ +/* + * Bit 0 (TraceEn) in the attr.config is meaningless as the + * corresponding bit in the RTIT_CTL can only be controlled + * by the driver; therefore, repurpose it to mean: pass + * through the bit that was previously assumed to be always + * on for PT, thereby allowing the user to *not* set it if + * they so wish. See also pt_event_valid() and pt_config(). + */ +#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN + +#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \ + RTIT_CTL_TSC_EN | \ RTIT_CTL_DISRETC | \ + RTIT_CTL_BRANCH_EN | \ RTIT_CTL_CYC_PSB | \ RTIT_CTL_MTC | \ RTIT_CTL_PWR_EVT_EN | \ @@ -332,6 +362,33 @@ static bool pt_event_valid(struct perf_event *event) return false; } + /* + * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config + * clears the assumption that BranchEn must always be enabled, + * as was the case with the first implementation of PT. + * If this bit is not set, the legacy behavior is preserved + * for compatibility with the older userspace. + * + * Re-using bit 0 for this purpose is fine because it is never + * directly set by the user; previous attempts at setting it in + * the attr.config resulted in -EINVAL. + */ + if (config & RTIT_CTL_PASSTHROUGH) { + /* + * Disallow not setting BRANCH_EN where BRANCH_EN is + * always required. + */ + if (pt_pmu.branch_en_always_on && + !(config & RTIT_CTL_BRANCH_EN)) + return false; + } else { + /* + * Disallow BRANCH_EN without the PASSTHROUGH. + */ + if (config & RTIT_CTL_BRANCH_EN) + return false; + } + return true; } @@ -411,6 +468,7 @@ static u64 pt_config_filters(struct perf_event *event) static void pt_config(struct perf_event *event) { + struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; if (!event->hw.itrace_started) { @@ -419,7 +477,20 @@ static void pt_config(struct perf_event *event) } reg = pt_config_filters(event); - reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN; + + /* + * Previously, we had BRANCH_EN on by default, but now that PT has + * grown features outside of branch tracing, it is useful to allow + * the user to disable it. Setting bit 0 in the event's attr.config + * allows BRANCH_EN to pass through instead of being always on. See + * also the comment in pt_event_valid().
+ */ + if (event->attr.config & BIT(0)) { + reg |= event->attr.config & RTIT_CTL_BRANCH_EN; + } else { + reg |= RTIT_CTL_BRANCH_EN; + } if (!event->attr.exclude_kernel) reg |= RTIT_CTL_OS; @@ -429,11 +500,15 @@ static void pt_config(struct perf_event *event) reg |= (event->attr.config & PT_CONFIG_MASK); event->hw.config = reg; - wrmsrl(MSR_IA32_RTIT_CTL, reg); + if (READ_ONCE(pt->vmx_on)) + perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); + else + wrmsrl(MSR_IA32_RTIT_CTL, reg); } static void pt_config_stop(struct perf_event *event) { + struct pt *pt = this_cpu_ptr(&pt_ctx); u64 ctl = READ_ONCE(event->hw.config); /* may be already stopped by a PMI */ @@ -441,7 +516,8 @@ static void pt_config_stop(struct perf_event *event) return; ctl &= ~RTIT_CTL_TRACEEN; - wrmsrl(MSR_IA32_RTIT_CTL, ctl); + if (!READ_ONCE(pt->vmx_on)) + wrmsrl(MSR_IA32_RTIT_CTL, ctl); WRITE_ONCE(event->hw.config, ctl); @@ -753,7 +829,8 @@ static void pt_handle_status(struct pt *pt) */ if (!pt_cap_get(PT_CAP_topa_multiple_entries) || buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { - local_inc(&buf->lost); + perf_aux_output_flag(&pt->handle, + PERF_AUX_FLAG_TRUNCATED); advance++; } } @@ -846,8 +923,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) + sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); return -EINVAL; + } /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ @@ -1171,12 +1250,6 @@ void intel_pt_interrupt(void) if (!READ_ONCE(pt->handle_nmi)) return; - /* - * If VMX is on and PT does not support it, don't touch anything. - */ - if (READ_ONCE(pt->vmx_on)) - return; - if (!event) return; @@ -1192,8 +1265,7 @@ void intel_pt_interrupt(void) pt_update_head(pt); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0)); if (!event->hw.state) { int ret; @@ -1208,7 +1280,7 @@ void intel_pt_interrupt(void) /* snapshot counters don't use PMI, so it's safe */ ret = pt_buffer_reset_markers(buf, &pt->handle); if (ret) { - perf_aux_output_end(&pt->handle, 0, true); + perf_aux_output_end(&pt->handle, 0); return; } @@ -1237,12 +1309,19 @@ void intel_pt_handle_vmx(int on) local_irq_save(flags); WRITE_ONCE(pt->vmx_on, on); - if (on) { - /* prevent pt_config_stop() from writing RTIT_CTL */ - event = pt->handle.event; - if (event) - event->hw.config = 0; - } + /* + * If an AUX transaction is in progress, it will contain + * gap(s), so flag it PARTIAL to inform the user. 
+ */ + event = pt->handle.event; + if (event) + perf_aux_output_flag(&pt->handle, + PERF_AUX_FLAG_PARTIAL); + + /* Turn PTs back on */ + if (!on && event) + wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); @@ -1257,9 +1336,6 @@ static void pt_event_start(struct perf_event *event, int mode) struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf; - if (READ_ONCE(pt->vmx_on)) - return; - buf = perf_aux_output_begin(&pt->handle, event); if (!buf) goto fail_stop; @@ -1280,7 +1356,7 @@ static void pt_event_start(struct perf_event *event, int mode) return; fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); + perf_aux_output_end(&pt->handle, 0); fail_stop: hwc->state = PERF_HES_STOPPED; } @@ -1321,8 +1397,7 @@ static void pt_event_stop(struct perf_event *event, int mode) pt->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0)); } } diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 53473c21b554..0eb41d07b79a 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -110,6 +110,7 @@ struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; bool vmx; + bool branch_en_always_on; unsigned long max_nonturbo_ratio; unsigned int tsc_art_num; unsigned int tsc_art_den; @@ -143,7 +144,6 @@ struct pt_buffer { size_t output_off; unsigned long nr_pages; local_t data_size; - local_t lost; local64_t head; bool snapshot; unsigned long stop_pos, intr_pos; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 22054ca49026..9d05c7e67f60 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -1,5 +1,5 @@ /* - * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Support Intel RAPL energy consumption counters * Copyright (C) 2013 Google, Inc., Stephane Eranian * * Intel RAPL interface is specified in the IA-32 Manual Vol3b diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index ad986c1e29bc..df5989f27b1b 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -360,7 +360,7 @@ extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; -/* perf_event_intel_uncore_snb.c */ +/* uncore_snb.c */ int snb_uncore_pci_init(void); int ivb_uncore_pci_init(void); int hsw_uncore_pci_init(void); @@ -371,7 +371,7 @@ void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); int snb_pci2phy_map_init(int devid); -/* perf_event_intel_uncore_snbep.c */ +/* uncore_snbep.c */ int snbep_uncore_pci_init(void); void snbep_uncore_cpu_init(void); int ivbep_uncore_pci_init(void); @@ -385,5 +385,5 @@ void knl_uncore_cpu_init(void); int skx_uncore_pci_init(void); void skx_uncore_cpu_init(void); -/* perf_event_intel_uncore_nhmex.c */ +/* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index bcbb1d2ae10b..be3d36254040 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -79,6 +79,7 @@ struct amd_nb { /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 +#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) /* * Flags PEBS can handle without an PMI. 
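The PEBS_COUNTER_MASK definition added above is what the earlier core.c and ds.c hunks rely on: it keeps only the low MAX_PEBS_EVENTS bits of a counter bitmap so that higher status bits do not leak into PEBS handling. Below is a standalone sketch of that masking, for illustration only; the sample status value and the program are hypothetical and not part of the patch.

#include <stdio.h>
#include <stdint.h>

/* Same construction as above: a mask covering the counters that can
 * carry PEBS, i.e. bits 0..MAX_PEBS_EVENTS-1. */
#define MAX_PEBS_EVENTS   8
#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)

int main(void)
{
	/* Hypothetical status word: counters 0 and 2 set, plus bit 62 and
	 * a bit at 32 standing in for non-PEBS status bits. */
	uint64_t status = (1ULL << 62) | (1ULL << 32) | (1ULL << 2) | (1ULL << 0);

	/* Only the low MAX_PEBS_EVENTS bits survive the mask; prints 0x5. */
	printf("masked: %#llx\n",
	       (unsigned long long)(status & PEBS_COUNTER_MASK));
	return 0;
}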
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index db64baf0e500..5b882cc0c0e9 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -25,47 +25,24 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/clockchips.h> +#include <linux/hyperv.h> - -#ifdef CONFIG_X86_64 +#ifdef CONFIG_HYPERV_TSCPAGE static struct ms_hyperv_tsc_page *tsc_pg; +struct ms_hyperv_tsc_page *hv_get_tsc_page(void) +{ + return tsc_pg; +} + static u64 read_hv_clock_tsc(struct clocksource *arg) { - u64 current_tick; + u64 current_tick = hv_read_tsc_page(tsc_pg); + + if (current_tick == U64_MAX) + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - if (tsc_pg->tsc_sequence != 0) { - /* - * Use the tsc page to compute the value. - */ - - while (1) { - u64 tmp; - u32 sequence = tsc_pg->tsc_sequence; - u64 cur_tsc; - u64 scale = tsc_pg->tsc_scale; - s64 offset = tsc_pg->tsc_offset; - - rdtscll(cur_tsc); - /* current_tick = ((cur_tsc *scale) >> 64) + offset */ - asm("mulq %3" - : "=d" (current_tick), "=a" (tmp) - : "a" (cur_tsc), "r" (scale)); - - current_tick += offset; - if (tsc_pg->tsc_sequence == sequence) - return current_tick; - - if (tsc_pg->tsc_sequence != 0) - continue; - /* - * Fallback using MSR method. - */ - break; - } - } - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); return current_tick; } @@ -139,7 +116,7 @@ void hyperv_init(void) /* * Register Hyper-V specific clocksource. */ -#ifdef CONFIG_X86_64 +#ifdef CONFIG_HYPERV_TSCPAGE if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { union hv_x64_msr_hypercall_contents tsc_msr; @@ -155,16 +132,19 @@ void hyperv_init(void) tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + + hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK; + clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); return; } +register_msr_cs: #endif /* * For 32 bit guests just use the MSR based mechanism for reading * the partition counter. */ -register_msr_cs: hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 395b69551fce..2efc768e4362 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -52,6 +52,8 @@ extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; void acpi_pic_sci_set_trigger(unsigned int, u16); +struct device; + extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity); extern void (*__acpi_unregister_gsi)(u32 gsi); diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 730ef65e8393..bdffcd9eab2b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -252,12 +252,6 @@ static inline int x2apic_enabled(void) { return 0; } #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ -#ifdef CONFIG_X86_64 -#define SET_APIC_ID(x) (apic->set_apic_id(x)) -#else - -#endif - /* * Copyright 2004 James Cleverdon, IBM. 
* Subject to the GNU Public License, v.2 @@ -299,6 +293,7 @@ struct apic { int (*phys_pkg_id)(int cpuid_apic, int index_msb); unsigned int (*get_apic_id)(unsigned long x); + /* Can't be NULL on 64-bit */ unsigned long (*set_apic_id)(unsigned int id); int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 14635c5ea025..caa5798c92f4 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) return cmpxchg(&v->counter, old, new); } +#define atomic_try_cmpxchg atomic_try_cmpxchg +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline int atomic_xchg(atomic_t *v, int new) { return xchg(&v->counter, new); @@ -201,16 +207,12 @@ static inline void atomic_##op(int i, atomic_t *v) \ } #define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ +static inline int atomic_fetch_##op(int i, atomic_t *v) \ { \ - int old, val = atomic_read(v); \ - for (;;) { \ - old = atomic_cmpxchg(v, val, val c_op i); \ - if (old == val) \ - break; \ - val = old; \ - } \ - return old; \ + int val = atomic_read(v); \ + do { \ + } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ + return val; \ } #define ATOMIC_OPS(op, c_op) \ @@ -236,16 +238,11 @@ ATOMIC_OPS(xor, ^) */ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { - int c, old; - c = atomic_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic_cmpxchg((v), c, c + (a)); - if (likely(old == c)) + int c = atomic_read(v); + do { + if (unlikely(c == u)) break; - c = old; - } + } while (!atomic_try_cmpxchg(v, &c, c + a)); return c; } diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 89ed2f6ae2f7..6189a433c9a9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) return cmpxchg(&v->counter, old, new); } +#define atomic64_try_cmpxchg atomic64_try_cmpxchg +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline long atomic64_xchg(atomic64_t *v, long new) { return xchg(&v->counter, new); @@ -192,17 +198,12 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c, old; - c = atomic64_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic64_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); + long c = atomic64_read(v); + do { + if (unlikely(c == u)) + return false; + } while (!atomic64_try_cmpxchg(v, &c, c + a)); + return true; } #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) @@ -216,17 +217,12 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long c, old, dec; - c = atomic64_read(v); - for (;;) { + long dec, c = atomic64_read(v); + do { dec = c - 1; if (unlikely(dec < 0)) break; - old = atomic64_cmpxchg((v), c, dec); - if (likely(old == c)) - break; - c = old; - } + } while (!atomic64_try_cmpxchg(v, &c, dec)); return dec; } @@ -242,14 +238,10 @@ static inline void atomic64_##op(long i, atomic64_t *v) \ 
#define ATOMIC64_FETCH_OP(op, c_op) \ static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ { \ - long old, val = atomic64_read(v); \ - for (;;) { \ - old = atomic64_cmpxchg(v, val, val c_op i); \ - if (old == val) \ - break; \ - val = old; \ - } \ - return old; \ + long val = atomic64_read(v); \ + do { \ + } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ + return val; \ } #define ATOMIC64_OPS(op, c_op) \ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index ba38ebbaced3..39e702d90cdb 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -1,36 +1,82 @@ #ifndef _ASM_X86_BUG_H #define _ASM_X86_BUG_H -#define HAVE_ARCH_BUG +#include <linux/stringify.h> -#ifdef CONFIG_DEBUG_BUGVERBOSE +/* + * Since some emulators terminate on UD2, we cannot use it for WARN. + * Since various instruction decoders disagree on the length of UD1, + * we cannot use it either. So use UD0 for WARN. + * + * (binutils knows about "ud1" but {en,de}codes it as 2 bytes, whereas + * our kernel decoder thinks it takes a ModRM byte, which seems consistent + * with various things like the Intel SDM instruction encoding rules) + */ + +#define ASM_UD0 ".byte 0x0f, 0xff" +#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */ +#define ASM_UD2 ".byte 0x0f, 0x0b" + +#define INSN_UD0 0xff0f +#define INSN_UD2 0x0b0f + +#define LEN_UD0 2 + +#ifdef CONFIG_GENERIC_BUG #ifdef CONFIG_X86_32 -# define __BUG_C0 "2:\t.long 1b, %c0\n" +# define __BUG_REL(val) ".long " __stringify(val) #else -# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n" +# define __BUG_REL(val) ".long " __stringify(val) " - 2b" #endif -#define BUG() \ -do { \ - asm volatile("1:\tud2\n" \ - ".pushsection __bug_table,\"a\"\n" \ - __BUG_C0 \ - "\t.word %c1, 0\n" \ - "\t.org 2b+%c2\n" \ - ".popsection" \ - : : "i" (__FILE__), "i" (__LINE__), \ - "i" (sizeof(struct bug_entry))); \ - unreachable(); \ +#ifdef CONFIG_DEBUG_BUGVERBOSE + +#define _BUG_FLAGS(ins, flags) \ +do { \ + asm volatile("1:\t" ins "\n" \ + ".pushsection __bug_table,\"a\"\n" \ + "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ + "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ + "\t.word %c1" "\t# bug_entry::line\n" \ + "\t.word %c2" "\t# bug_entry::flags\n" \ + "\t.org 2b+%c3\n" \ + ".popsection" \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (flags), \ + "i" (sizeof(struct bug_entry))); \ } while (0) +#else /* !CONFIG_DEBUG_BUGVERBOSE */ + +#define _BUG_FLAGS(ins, flags) \ +do { \ + asm volatile("1:\t" ins "\n" \ + ".pushsection __bug_table,\"a\"\n" \ + "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ + "\t.word %c0" "\t# bug_entry::flags\n" \ + "\t.org 2b+%c1\n" \ + ".popsection" \ + : : "i" (flags), \ + "i" (sizeof(struct bug_entry))); \ +} while (0) + +#endif /* CONFIG_DEBUG_BUGVERBOSE */ + #else + +#define _BUG_FLAGS(ins, flags) asm volatile(ins) + +#endif /* CONFIG_GENERIC_BUG */ + +#define HAVE_ARCH_BUG #define BUG() \ do { \ - asm volatile("ud2"); \ + _BUG_FLAGS(ASM_UD2, 0); \ unreachable(); \ } while (0) -#endif + +#define __WARN_FLAGS(flags) _BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags)) #include <asm-generic/bug.h> diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e7e1942edff7..8b4140f6724f 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,93 +5,8 @@ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> -/* - * The set_memory_* API can be used to change various attributes of a virtual - * address range. 
The attributes include: - * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack - * Executability : eXeutable, NoteXecutable - * Read/Write : ReadOnly, ReadWrite - * Presence : NotPresent - * - * Within a category, the attributes are mutually exclusive. - * - * The implementation of this API will take care of various aspects that - * are associated with changing such attributes, such as: - * - Flushing TLBs - * - Flushing CPU caches - * - Making sure aliases of the memory behind the mapping don't violate - * coherency rules as defined by the CPU in the system. - * - * What this API does not do: - * - Provide exclusion between various callers - including callers that - * operation on other mappings of the same physical page - * - Restore default attributes when a page is freed - * - Guarantee that mappings other than the requested one are - * in any state, other than that these do not violate rules for - * the CPU you have. Do not depend on any effects on other mappings, - * CPUs other than the one you have may have more relaxed rules. - * The caller is required to take care of these. - */ - -int _set_memory_uc(unsigned long addr, int numpages); -int _set_memory_wc(unsigned long addr, int numpages); -int _set_memory_wt(unsigned long addr, int numpages); -int _set_memory_wb(unsigned long addr, int numpages); -int set_memory_uc(unsigned long addr, int numpages); -int set_memory_wc(unsigned long addr, int numpages); -int set_memory_wt(unsigned long addr, int numpages); -int set_memory_wb(unsigned long addr, int numpages); -int set_memory_x(unsigned long addr, int numpages); -int set_memory_nx(unsigned long addr, int numpages); -int set_memory_ro(unsigned long addr, int numpages); -int set_memory_rw(unsigned long addr, int numpages); -int set_memory_np(unsigned long addr, int numpages); -int set_memory_4k(unsigned long addr, int numpages); - -int set_memory_array_uc(unsigned long *addr, int addrinarray); -int set_memory_array_wc(unsigned long *addr, int addrinarray); -int set_memory_array_wt(unsigned long *addr, int addrinarray); -int set_memory_array_wb(unsigned long *addr, int addrinarray); - -int set_pages_array_uc(struct page **pages, int addrinarray); -int set_pages_array_wc(struct page **pages, int addrinarray); -int set_pages_array_wt(struct page **pages, int addrinarray); -int set_pages_array_wb(struct page **pages, int addrinarray); - -/* - * For legacy compatibility with the old APIs, a few functions - * are provided that work on a "struct page". - * These functions operate ONLY on the 1:1 kernel mapping of the - * memory that the struct page represents, and internally just - * call the set_memory_* function. See the description of the - * set_memory_* function for more details on conventions. - * - * These APIs should be considered *deprecated* and are likely going to - * be removed in the future. - * The reason for this is the implicit operation on the 1:1 mapping only, - * making this not a generally useful API. - * - * Specifically, many users of the old APIs had a virtual address, - * called virt_to_page() or vmalloc_to_page() on that address to - * get a struct page* that the old API required. - * To convert these cases, use set_memory_*() on the original - * virtual address, do not use these functions. 
- */ - -int set_pages_uc(struct page *page, int numpages); -int set_pages_wb(struct page *page, int numpages); -int set_pages_x(struct page *page, int numpages); -int set_pages_nx(struct page *page, int numpages); -int set_pages_ro(struct page *page, int numpages); -int set_pages_rw(struct page *page, int numpages); - - void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -extern int kernel_set_to_readonly; -void set_kernel_text_rw(void); -void set_kernel_text_ro(void); - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index eae33c7170c8..47bea8cadbd0 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -6,7 +6,8 @@ #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */ -#define VCLOCK_MAX 2 +#define VCLOCK_HVCLOCK 3 /* vDSO should use vread_hvclock. */ +#define VCLOCK_MAX 3 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 97848cdfcb1a..d90296d061e8 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -153,6 +153,76 @@ extern void __add_wrong_size(void) #define cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) + +#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ +({ \ + bool success; \ + __typeof__(_ptr) _old = (_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(_ptr); \ + asm volatile(lock "cmpxchgb %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "q" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(_ptr); \ + asm volatile(lock "cmpxchgw %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(_ptr); \ + asm volatile(lock "cmpxchgl %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(_ptr); \ + asm volatile(lock "cmpxchgq %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); \ +}) + +#define __try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) + +#define try_cmpxchg(ptr, pold, new) \ + __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + /* * xadd() adds "inc" to "*ptr" and atomically returns the previous * value of "*ptr". diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4e7772387c6e..2701e5f8145b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -187,6 +187,7 @@ * Reuse free bits when adding new feature flags! 
*/ #define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ @@ -201,6 +202,8 @@ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ @@ -289,7 +292,8 @@ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ +#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 29e53ea7d764..ed8b66de541f 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -125,16 +125,6 @@ static inline void le128_inc(le128 *i) i->b = cpu_to_le64(b); } -static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src) -{ - u64 a = le64_to_cpu(src->a); - u64 b = le64_to_cpu(src->b); - u64 _tt = ((s64)a >> 63) & 0x87; - - dst->a = cpu_to_le64((a << 1) ^ (b >> 63)); - dst->b = cpu_to_le64((b << 1) ^ _tt); -} - extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, struct scatterlist *dst, diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 1548ca92ad3f..d0a21b12dd58 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -4,6 +4,7 @@ #include <asm/desc_defs.h> #include <asm/ldt.h> #include <asm/mmu.h> +#include <asm/fixmap.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -45,11 +46,43 @@ struct gdt_page { DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +/* Provide the original GDT */ +static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) { return per_cpu(gdt_page, cpu).gdt; } +/* Provide the current original GDT */ +static inline struct desc_struct *get_current_gdt_rw(void) +{ + return this_cpu_ptr(&gdt_page)->gdt; +} + +/* Get the fixmap index for a specific processor */ +static inline unsigned int get_cpu_gdt_ro_index(int cpu) +{ + return FIX_GDT_REMAP_BEGIN + cpu; +} + +/* Provide the fixmap address of the remapped GDT */ +static inline struct desc_struct *get_cpu_gdt_ro(int cpu) +{ + unsigned int idx = get_cpu_gdt_ro_index(cpu); + return (struct desc_struct *)__fix_to_virt(idx); +} + +/* Provide the current read-only GDT */ +static inline struct desc_struct *get_current_gdt_ro(void) +{ + return get_cpu_gdt_ro(smp_processor_id()); +} + +/* Provide the physical address of the GDT page. 
*/ +static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) +{ + return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); +} + #ifdef CONFIG_X86_64 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, @@ -174,7 +207,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) { - struct desc_struct *d = get_cpu_gdt_table(cpu); + struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, @@ -194,22 +227,90 @@ static inline void native_set_ldt(const void *addr, unsigned int entries) set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, entries * LDT_ENTRY_SIZE - 1); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT, &ldt, DESC_LDT); asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); } } +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +/* + * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is + * a read-only remapping. To prevent a page fault, the GDT is switched to the + * original writeable version when needed. + */ +#ifdef CONFIG_X86_64 +static inline void native_load_tr_desc(void) +{ + struct desc_ptr gdt; + int cpu = raw_smp_processor_id(); + bool restore = 0; + struct desc_struct *fixmap_gdt; + + native_store_gdt(&gdt); + fixmap_gdt = get_cpu_gdt_ro(cpu); + + /* + * If the current GDT is the read-only fixmap, swap to the original + * writeable version. Swap back at the end. 
+ */ + if (gdt.address == (unsigned long)fixmap_gdt) { + load_direct_gdt(cpu); + restore = 1; + } + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); + if (restore) + load_fixmap_gdt(cpu); +} +#else static inline void native_load_tr_desc(void) { asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } +#endif + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + + asm volatile("str %0":"=r" (tr)); + + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_rw(cpu); + unsigned int i; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} DECLARE_PER_CPU(bool, __tss_limit_invalid); static inline void force_reload_TR(void) { - struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); + struct desc_struct *d = get_current_gdt_rw(); tss_desc tss; memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); @@ -257,44 +358,6 @@ static inline void invalidate_tss_limit(void) this_cpu_write(__tss_limit_invalid, true); } -static inline void native_load_gdt(const struct desc_ptr *dtr) -{ - asm volatile("lgdt %0"::"m" (*dtr)); -} - -static inline void native_load_idt(const struct desc_ptr *dtr) -{ - asm volatile("lidt %0"::"m" (*dtr)); -} - -static inline void native_store_gdt(struct desc_ptr *dtr) -{ - asm volatile("sgdt %0":"=m" (*dtr)); -} - -static inline void native_store_idt(struct desc_ptr *dtr) -{ - asm volatile("sidt %0":"=m" (*dtr)); -} - -static inline unsigned long native_store_tr(void) -{ - unsigned long tr; - - asm volatile("str %0":"=r" (tr)); - - return tr; -} - -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) -{ - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - unsigned int i; - - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; -} - /* This intentionally ignores lm, since 32-bit apps don't have that field. */ #define LDT_empty(info) \ ((info)->base_addr == 0 && \ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 85599ad4d024..5dff775af7cd 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -36,6 +36,12 @@ # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ +#ifdef CONFIG_X86_5LEVEL +# define DISABLE_LA57 0 +#else +# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -55,7 +61,7 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) #define DISABLED_MASK17 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h deleted file mode 100644 index 67313f3a9874..000000000000 --- a/arch/x86/include/asm/e820.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef _ASM_X86_E820_H -#define _ASM_X86_E820_H - -/* - * E820_X_MAX is the maximum size of the extended E820 table. The extended - * table may contain up to 3 extra E820 entries per possible NUMA node, so we - * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128. - * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h. 
- */ -#include <linux/numa.h> -#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) - -#include <uapi/asm/e820.h> - -#ifndef __ASSEMBLY__ -/* see comment in arch/x86/kernel/e820.c */ -extern struct e820map *e820; -extern struct e820map *e820_saved; - -extern unsigned long pci_mem_start; -extern int e820_any_mapped(u64 start, u64 end, unsigned type); -extern int e820_all_mapped(u64 start, u64 end, unsigned type); -extern void e820_add_region(u64 start, u64 size, int type); -extern void e820_print_map(char *who); -extern int -sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map); -extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type); -extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, - int checktype); -extern void update_e820(void); -extern void e820_setup_gap(void); -struct setup_data; -extern void parse_e820_ext(u64 phys_addr, u32 data_len); - -#if defined(CONFIG_X86_64) || \ - (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) -extern void e820_mark_nosave_regions(unsigned long limit_pfn); -#else -static inline void e820_mark_nosave_regions(unsigned long limit_pfn) -{ -} -#endif - -extern unsigned long e820_end_of_ram_pfn(void); -extern unsigned long e820_end_of_low_ram_pfn(void); -extern u64 early_reserve_e820(u64 sizet, u64 align); - -void memblock_x86_fill(void); -void memblock_find_dma_reserve(void); - -extern void finish_e820_parsing(void); -extern void e820_reserve_resources(void); -extern void e820_reserve_resources_late(void); -extern void setup_memory_map(void); -extern char *default_machine_specific_memory_setup(void); - -extern void e820_reallocate_tables(void); - -/* - * Returns true iff the specified range [s,e) is completely contained inside - * the ISA region. 
- */ -static inline bool is_ISA_range(u64 s, u64 e) -{ - return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS; -} - -#endif /* __ASSEMBLY__ */ -#include <linux/ioport.h> - -#define HIGH_MEMORY (1024*1024) -#endif /* _ASM_X86_E820_H */ diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h new file mode 100644 index 000000000000..8e0f8b85b209 --- /dev/null +++ b/arch/x86/include/asm/e820/api.h @@ -0,0 +1,50 @@ +#ifndef _ASM_E820_API_H +#define _ASM_E820_API_H + +#include <asm/e820/types.h> + +extern struct e820_table *e820_table; +extern struct e820_table *e820_table_firmware; + +extern unsigned long pci_mem_start; + +extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type); +extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); + +extern void e820__range_add (u64 start, u64 size, enum e820_type type); +extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); +extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); + +extern void e820__print_table(char *who); +extern int e820__update_table(struct e820_table *table); +extern void e820__update_table_print(void); + +extern unsigned long e820__end_of_ram_pfn(void); +extern unsigned long e820__end_of_low_ram_pfn(void); + +extern u64 e820__memblock_alloc_reserved(u64 size, u64 align); +extern void e820__memblock_setup(void); + +extern void e820__reserve_setup_data(void); +extern void e820__finish_early_params(void); +extern void e820__reserve_resources(void); +extern void e820__reserve_resources_late(void); + +extern void e820__memory_setup(void); +extern void e820__memory_setup_extended(u64 phys_addr, u32 data_len); +extern char *e820__memory_setup_default(void); +extern void e820__setup_pci_gap(void); + +extern void e820__reallocate_tables(void); +extern void e820__register_nosave_regions(unsigned long limit_pfn); + +/* + * Returns true iff the specified range [start,end) is completely contained inside + * the ISA region. + */ +static inline bool is_ISA_range(u64 start, u64 end) +{ + return start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS; +} + +#endif /* _ASM_E820_API_H */ diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h new file mode 100644 index 000000000000..4adeed03a9a1 --- /dev/null +++ b/arch/x86/include/asm/e820/types.h @@ -0,0 +1,104 @@ +#ifndef _ASM_E820_TYPES_H +#define _ASM_E820_TYPES_H + +#include <uapi/asm/bootparam.h> + +/* + * These are the E820 types known to the kernel: + */ +enum e820_type { + E820_TYPE_RAM = 1, + E820_TYPE_RESERVED = 2, + E820_TYPE_ACPI = 3, + E820_TYPE_NVS = 4, + E820_TYPE_UNUSABLE = 5, + E820_TYPE_PMEM = 7, + + /* + * This is a non-standardized way to represent ADR or + * NVDIMM regions that persist over a reboot. + * + * The kernel will ignore their special capabilities + * unless the CONFIG_X86_PMEM_LEGACY=y option is set. + * + * ( Note that older platforms also used 6 for the same + * type of memory, but newer versions switched to 12 as + * 6 was assigned differently. Some time they will learn... 
) + */ + E820_TYPE_PRAM = 12, + + /* + * Reserved RAM used by the kernel itself if + * CONFIG_INTEL_TXT=y is enabled, memory of this type + * will be included in the S3 integrity calculation + * and so should not include any memory that the BIOS + * might alter over the S3 transition: + */ + E820_TYPE_RESERVED_KERN = 128, +}; + +/* + * A single E820 map entry, describing a memory range of [addr...addr+size-1], + * of 'type' memory type: + * + * (We pack it because there can be thousands of them on large systems.) + */ +struct e820_entry { + u64 addr; + u64 size; + enum e820_type type; +} __attribute__((packed)); + +/* + * The legacy E820 BIOS limits us to 128 (E820_MAX_ENTRIES_ZEROPAGE) nodes + * due to the constrained space in the zeropage. + * + * On large systems we can easily have thousands of nodes with RAM, + * which cannot be fit into so few entries - so we have a mechanism + * to extend the e820 table size at build-time, via the E820_MAX_ENTRIES + * define below. + * + * ( Those extra entries are enumerated via the EFI memory map, not + * via the legacy zeropage mechanism. ) + * + * Size our internal memory map tables to have room for these additional + * entries, based on a heuristic calculation: up to three entries per + * NUMA node, plus E820_MAX_ENTRIES_ZEROPAGE for some extra space. + * + * This allows for bootstrap/firmware quirks such as possible duplicate + * E820 entries that might need room in the same arrays, prior to the + * call to e820__update_table() to remove duplicates. The allowance + * of three memory map entries per node is "enough" entries for + * the initial hardware platform motivating this mechanism to make + * use of additional EFI map entries. Future platforms may want + * to allow more than three entries per node or otherwise refine + * this size. + */ + +#include <linux/numa.h> + +#define E820_MAX_ENTRIES (E820_MAX_ENTRIES_ZEROPAGE + 3*MAX_NUMNODES) + +/* + * The whole array of E820 entries: + */ +struct e820_table { + __u32 nr_entries; + struct e820_entry entries[E820_MAX_ENTRIES]; +}; + +/* + * Various well-known legacy memory ranges in physical memory: + */ +#define ISA_START_ADDRESS 0x000a0000 +#define ISA_END_ADDRESS 0x00100000 + +#define BIOS_BEGIN 0x000a0000 +#define BIOS_END 0x00100000 + +#define HIGH_MEMORY 0x00100000 + +#define BIOS_ROM_BASE 0xffe00000 +#define BIOS_ROM_END 0xffffffff + +#endif /* _ASM_E820_TYPES_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9d49c18b5ea9..e8ab9a46bc68 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -287,14 +287,29 @@ struct task_struct; #define ARCH_DLINFO_IA32 \ do { \ - if (vdso32_enabled) { \ + if (VDSO_CURRENT_BASE) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ } while (0) +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static inline int mmap_is_ia32(void) +{ + return IS_ENABLED(CONFIG_X86_32) || + (IS_ENABLED(CONFIG_COMPAT) && + test_thread_flag(TIF_ADDR32)); +} + +extern unsigned long tasksize_32bit(void); +extern unsigned long tasksize_64bit(void); +extern unsigned long get_mmap_base(int is_legacy); + #ifdef CONFIG_X86_32 +#define __STACK_RND_MASK(is32bit) (0x7ff) #define STACK_RND_MASK (0x7ff) #define ARCH_DLINFO ARCH_DLINFO_IA32 @@ -304,7 +319,8 @@ do { \ #else /* CONFIG_X86_32 */ /* 1GB for 64bit, 8MB for 32bit */ -#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff) +#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
0x7ff : 0x3fffff) +#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) #define ARCH_DLINFO \ do { \ @@ -348,16 +364,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages -/* - * True on X86_32 or when emulating IA32 on X86_64 - */ -static inline int mmap_is_ia32(void) -{ - return IS_ENABLED(CONFIG_X86_32) || - (IS_ENABLED(CONFIG_COMPAT) && - test_thread_flag(TIF_ADDR32)); -} - /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8554f960e21b..b65155cc3760 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -100,6 +100,10 @@ enum fixed_addresses { #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif + /* Fixmap entries to remap the GDTs, one per processor. */ + FIX_GDT_REMAP_BEGIN, + FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + __end_of_permanent_fixed_addresses, /* diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 156cd5d18d2a..1d268098ac2e 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_GART_H #define _ASM_X86_GART_H -#include <asm/e820.h> +#include <asm/e820/api.h> extern void set_up_gart_resume(u32, u32); @@ -97,7 +97,7 @@ static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n"); return 0; } - if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + if (e820__mapped_any(aper_base, aper_base + aper_size, E820_TYPE_RAM)) { printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n"); return 0; } diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 67942b6ad4b7..21126155a739 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -35,9 +35,6 @@ struct hypervisor_x86 { /* Detection routine */ uint32_t (*detect)(void); - /* Adjust CPU feature bits (run once per CPU) */ - void (*set_cpu_features)(struct cpuinfo_x86 *); - /* Platform setup (run once per boot) */ void (*init_platform)(void); @@ -53,15 +50,14 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen; +extern const struct hypervisor_x86 x86_hyper_xen_pv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; extern const struct hypervisor_x86 x86_hyper_kvm; -extern void init_hypervisor(struct cpuinfo_x86 *c); extern void init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); extern void hypervisor_pin_vcpu(int cpu); #else -static inline void init_hypervisor(struct cpuinfo_x86 *c) { } static inline void init_hypervisor_platform(void) { } static inline bool hypervisor_x2apic_available(void) { return false; } #endif /* CONFIG_HYPERVISOR_GUEST */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9814db42b790..75b748a1deb8 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -12,6 +12,7 @@ */ #define INTEL_FAM6_CORE_YONAH 0x0E + #define INTEL_FAM6_CORE2_MEROM 0x0F #define INTEL_FAM6_CORE2_MEROM_L 0x16 #define INTEL_FAM6_CORE2_PENRYN 0x17 @@ -21,6 +22,7 @@ #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ #define 
INTEL_FAM6_NEHALEM_EP 0x1A #define INTEL_FAM6_NEHALEM_EX 0x2E + #define INTEL_FAM6_WESTMERE 0x25 #define INTEL_FAM6_WESTMERE_EP 0x2C #define INTEL_FAM6_WESTMERE_EX 0x2F @@ -36,9 +38,9 @@ #define INTEL_FAM6_HASWELL_GT3E 0x46 #define INTEL_FAM6_BROADWELL_CORE 0x3D -#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_BROADWELL_GT3E 0x47 #define INTEL_FAM6_BROADWELL_X 0x4F +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_SKYLAKE_MOBILE 0x4E #define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E @@ -59,8 +61,8 @@ #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ #define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ +#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A /* Xeon Phi */ diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index 4291b6a5ddf7..fac89eb78a6b 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -23,6 +23,11 @@ #define IPC_ERR_EMSECURITY 6 #define IPC_ERR_UNSIGNEDKERNEL 7 +/* GCR reg offsets from gcr base*/ +#define PMC_GCR_PMC_CFG_REG 0x08 +#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78 +#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80 + #if IS_ENABLED(CONFIG_INTEL_PMC_IPC) int intel_pmc_ipc_simple_command(int cmd, int sub); @@ -31,6 +36,9 @@ int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen); int intel_pmc_s0ix_counter_read(u64 *data); +int intel_pmc_gcr_read(u32 offset, u32 *data); +int intel_pmc_gcr_write(u32 offset, u32 data); +int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val); #else @@ -56,6 +64,21 @@ static inline int intel_pmc_s0ix_counter_read(u64 *data) return -EINVAL; } +static inline int intel_pmc_gcr_read(u32 offset, u32 *data) +{ + return -EINVAL; +} + +static inline int intel_pmc_gcr_write(u32 offset, u32 data) +{ + return -EINVAL; +} + +static inline int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val) +{ + return -EINVAL; +} + #endif /*CONFIG_INTEL_PMC_IPC*/ #endif diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 0d64397cee58..597dc4995678 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -12,6 +12,7 @@ #define IA32_L3_QOS_CFG 0xc81 #define IA32_L3_CBM_BASE 0xc90 #define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 #define L3_QOS_CDP_ENABLE 0x01ULL @@ -37,23 +38,30 @@ struct rdtgroup { /* rdtgroup.flags */ #define RDT_DELETED 1 +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + /* List of all resource groups */ extern struct list_head rdt_all_groups; +extern int max_name_width, max_data_width; + int __init rdtgroup_init(void); /** * struct rftype - describe each file in the resctrl file system - * @name: file name - * @mode: access mode - * @kf_ops: operations - * @seq_show: show content of the file - * @write: write to the file + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @seq_show: Show content of the file + * @write: Write to the file */ struct rftype { char *name; umode_t mode; struct kernfs_ops *kf_ops; + unsigned long flags; int (*seq_show)(struct kernfs_open_file *of, struct seq_file *sf, void *v); @@ -67,54 +75,21 @@ struct rftype { }; /** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is 
this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @max_cbm: Largest Cache Bit Mask allowed - * @min_cbm_bits: Minimum number of consecutive bits to be set - * in a cache bit mask - * @domains: All domains for this resource - * @num_domains: Number of domains active - * @msr_base: Base MSR address for CBMs - * @tmp_cbms: Scratch space when updating schemata - * @num_tmp_cbms: Number of CBMs in tmp_cbms - * @cache_level: Which cache level defines scope of this domain - * @cbm_idx_multi: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cbm_len; - int min_cbm_bits; - u32 max_cbm; - struct list_head domains; - int num_domains; - int msr_base; - u32 *tmp_cbms; - int num_tmp_cbms; - int cache_level; - int cbm_idx_multi; - int cbm_idx_offset; -}; - -/** * struct rdt_domain - group of cpus sharing an RDT resource * @list: all instances of this resource * @id: unique id for this instance * @cpu_mask: which cpus share this resource - * @cbm: array of cache bit masks (indexed by CLOSID) + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain */ struct rdt_domain { struct list_head list; int id; struct cpumask cpu_mask; - u32 *cbm; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; }; /** @@ -129,6 +104,83 @@ struct msr_param { int high; }; +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @capable: Is this feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
+ * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @info_files: resctrl info files for the resource + * @nr_info_files: Number of info files + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + */ +struct rdt_resource { + bool enabled; + bool capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + struct rftype *info_files; + int nr_info_files; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); +}; + +void rdt_get_cache_infofile(struct rdt_resource *r); +void rdt_get_mba_infofile(struct rdt_resource *r); +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + extern struct mutex rdtgroup_mutex; extern struct rdt_resource rdt_resources_all[]; @@ -142,6 +194,7 @@ enum { RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, /* Must be the last */ RDT_NUM_RESOURCES, @@ -149,7 +202,7 @@ enum { #define for_each_capable_rdt_resource(r) \ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + r++) \ if (r->capable) #define for_each_enabled_rdt_resource(r) \ @@ -165,8 +218,16 @@ union cpuid_0x10_1_eax { unsigned int full; }; -/* CPUID.(EAX=10H, ECX=ResID=1).EDX */ -union cpuid_0x10_1_edx { +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { struct { unsigned int cos_max:16; } split; @@ -175,7 +236,7 @@ union cpuid_0x10_1_edx { DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); -void rdt_cbm_update(void *arg); +void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 4fb1d0abef95..81d3d8776fd9 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -3,6 +3,9 @@ #include <linux/notifier.h> +#define IPCMSG_INDIRECT_READ 0x02 +#define IPCMSG_INDIRECT_WRITE 0x05 + #define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ #define IPCMSG_WARM_RESET 0xF0 @@ -45,7 +48,10 @@ int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); /* Issue commands to the SCU with or without data */ int intel_scu_ipc_simple_command(int cmd, int sub); int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, - u32 *out, int outlen); + u32 *out, int outlen); +int intel_scu_ipc_raw_command(int cmd, int sub, u8 *in, int inlen, + u32 *out, int outlen, u32 dptr, u32 sptr); + /* I2C control api */ int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data); diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index b41ee164930a..c313cac36f56 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -5,6 +5,8 @@ #ifndef IOSF_MBI_SYMS_H #define 
IOSF_MBI_SYMS_H +#include <linux/notifier.h> + #define MBI_MCR_OFFSET 0xD0 #define MBI_MDR_OFFSET 0xD4 #define MBI_MCRX_OFFSET 0xD8 @@ -47,6 +49,10 @@ #define QRK_MBI_UNIT_MM 0x05 #define QRK_MBI_UNIT_SOC 0x31 +/* Action values for the pmic_bus_access_notifier functions */ +#define MBI_PMIC_BUS_ACCESS_BEGIN 1 +#define MBI_PMIC_BUS_ACCESS_END 2 + #if IS_ENABLED(CONFIG_IOSF_MBI) bool iosf_mbi_available(void); @@ -88,6 +94,65 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr); */ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); +/** + * iosf_mbi_punit_acquire() - Acquire access to the P-Unit + * + * One some systems the P-Unit accesses the PMIC to change various voltages + * through the same bus as other kernel drivers use for e.g. battery monitoring. + * + * If a driver sends requests to the P-Unit which require the P-Unit to access + * the PMIC bus while another driver is also accessing the PMIC bus various bad + * things happen. + * + * To avoid these problems this function must be called before accessing the + * P-Unit or the PMIC, be it through iosf_mbi* functions or through other means. + * + * Note on these systems the i2c-bus driver will request a sempahore from the + * P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing + * it, but this does not appear to be sufficient, we still need to avoid making + * certain P-Unit requests during the access window to avoid problems. + * + * This function locks a mutex, as such it may sleep. + */ +void iosf_mbi_punit_acquire(void); + +/** + * iosf_mbi_punit_release() - Release access to the P-Unit + */ +void iosf_mbi_punit_release(void); + +/** + * iosf_mbi_register_pmic_bus_access_notifier - Register PMIC bus notifier + * + * This function can be used by drivers which may need to acquire P-Unit + * managed resources from interrupt context, where iosf_mbi_punit_acquire() + * can not be used. + * + * This function allows a driver to register a notifier to get notified (in a + * process context) before other drivers start accessing the PMIC bus. + * + * This allows the driver to acquire any resources, which it may need during + * the window the other driver is accessing the PMIC, before hand. 
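To illustrate the intended use of the P-Unit/PMIC serialization helpers declared above, here is a minimal consumer sketch. It is an assumption for illustration only: the example_* names are hypothetical, while the iosf_mbi_*() calls, the notifier_block mechanics and the MBI_PMIC_BUS_ACCESS_* actions are the ones declared in this header.

#include <linux/notifier.h>
#include <asm/iosf_mbi.h>

/* Hypothetical consumer: reacts to PMIC bus access windows and
 * serializes its own PMIC/P-Unit traffic against other drivers. */
static int example_punit_notify(struct notifier_block *nb,
				unsigned long action, void *data)
{
	if (action == MBI_PMIC_BUS_ACCESS_BEGIN) {
		/* Another driver is about to use the PMIC bus: acquire any
		 * P-Unit managed resources we may later need from IRQ
		 * context, while sleeping is still allowed. */
	} else if (action == MBI_PMIC_BUS_ACCESS_END) {
		/* The access window closed: release those resources. */
	}
	return NOTIFY_OK;
}

static struct notifier_block example_punit_nb = {
	.notifier_call = example_punit_notify,
};

static int example_register(void)
{
	return iosf_mbi_register_pmic_bus_access_notifier(&example_punit_nb);
}

static void example_pmic_transfer(void)
{
	iosf_mbi_punit_acquire();	/* may sleep; takes the shared mutex */
	/* ... perform the PMIC / P-Unit access here ... */
	iosf_mbi_punit_release();
}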
+ * + * @nb: notifier_block to register + */ +int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb); + +/** + * iosf_mbi_register_pmic_bus_access_notifier - Unregister PMIC bus notifier + * + * @nb: notifier_block to unregister + */ +int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb); + +/** + * iosf_mbi_call_pmic_bus_access_notifier_chain - Call PMIC bus notifier chain + * + * @val: action to pass into listener's notifier_call function + * @v: data pointer to pass into listener's notifier_call function + */ +int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v); + #else /* CONFIG_IOSF_MBI is not enabled */ static inline bool iosf_mbi_available(void) @@ -115,6 +180,28 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) WARN(1, "IOSF_MBI driver not available"); return -EPERM; } + +static inline void iosf_mbi_punit_acquire(void) {} +static inline void iosf_mbi_punit_release(void) {} + +static inline +int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline +int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline +int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v) +{ + return 0; +} + #endif /* CONFIG_IOSF_MBI */ #endif /* IOSF_MBI_SYMS_H */ diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 1410b567ecde..f527b02a0ee3 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -11,9 +11,12 @@ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT */ #define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ - (0xffff800000000000ULL >> 3)) -/* 47 bits for kernel address -> (47 - 3) bits for shadow */ -#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3))) + ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3)) +/* + * 47 bits for kernel address -> (47 - 3) bits for shadow + * 56 bits for kernel address -> (56 - 3) bits for shadow + */ +#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3))) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 282630e4c6ea..70ef205489f0 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -164,6 +164,7 @@ struct kimage_arch { }; #else struct kimage_arch { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 200581691c6e..34b984c60790 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -72,14 +72,13 @@ struct arch_specific_insn { /* copy of the original instruction */ kprobe_opcode_t *insn; /* - * boostable = -1: This instruction type is not boostable. - * boostable = 0: This instruction type is boostable. - * boostable = 1: This instruction has been boosted: we have + * boostable = false: This instruction type is not boostable. + * boostable = true: This instruction has been boosted: we have * added a relative jump after the instruction copy in insn, * so no single-step and fixup are needed (unless there's * a post_handler or break_handler). 
*/ - int boostable; + bool boostable; bool if_modifier; }; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 3e8c287090e4..055962615779 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -221,6 +221,9 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); + + unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -290,7 +293,6 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74ef58c8ff53..9c761fea0c98 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,8 +43,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 #define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS @@ -63,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS 22 -#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 @@ -343,9 +341,10 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - int root_level; - int shadow_root_level; union kvm_mmu_page_role base_role; + u8 root_level; + u8 shadow_root_level; + u8 ept_ad; bool direct_map; /* @@ -612,6 +611,8 @@ struct kvm_vcpu_arch { unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; unsigned long guest_debug_dr7; + u64 msr_platform_info; + u64 msr_misc_features_enables; u64 mcg_cap; u64 mcg_status; @@ -1019,6 +1020,8 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + int (*write_log_dirty)(struct kvm_vcpu *vcpu); + /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index d74747b031ec..c4eda791f877 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -46,6 +46,7 @@ struct kvm_page_track_notifier_node { }; void kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_cleanup(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index e63873683d4a..4fd5195deed0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -128,7 +128,7 @@ * debugging tools. Each entry is only valid when its finished flag * is set. 
*/ -struct mce_log { +struct mce_log_buffer { char signature[12]; /* "MACHINECHECK" */ unsigned len; /* = MCE_LOG_LEN */ unsigned next; @@ -191,10 +191,12 @@ extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; enum mce_notifier_prios { - MCE_PRIO_SRAO = INT_MAX, - MCE_PRIO_EXTLOG = INT_MAX - 1, - MCE_PRIO_NFIT = INT_MAX - 2, - MCE_PRIO_EDAC = INT_MAX - 3, + MCE_PRIO_FIRST = INT_MAX, + MCE_PRIO_SRAO = INT_MAX - 1, + MCE_PRIO_EXTLOG = INT_MAX - 2, + MCE_PRIO_NFIT = INT_MAX - 3, + MCE_PRIO_EDAC = INT_MAX - 4, + MCE_PRIO_MCELOG = 1, MCE_PRIO_LOWEST = 0, }; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 306c7e12af55..68b329d77b3a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -268,8 +268,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); -} #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 32007041ef8c..831eb7895535 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -64,7 +64,7 @@ static inline void find_smp_config(void) } #ifdef CONFIG_X86_MPPARSE -extern void early_reserve_e820_mpc_new(void); +extern void e820__memblock_alloc_reserved_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); extern void default_smp_read_mpc_oem(struct mpc_table *mpc); @@ -76,7 +76,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); extern void default_find_smp_config(void); extern void default_get_smp_config(unsigned int early); #else -static inline void early_reserve_e820_mpc_new(void) { } +static inline void e820__memblock_alloc_reserved_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 7c9c895432a9..fba100713924 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -176,4 +176,58 @@ void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); #endif +#ifdef CONFIG_HYPERV_TSCPAGE +struct ms_hyperv_tsc_page *hv_get_tsc_page(void); +static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +{ + u64 scale, offset, cur_tsc; + u32 sequence; + + /* + * The protocol for reading Hyper-V TSC page is specified in Hypervisor + * Top-Level Functional Specification ver. 3.0 and above. To get the + * reference time we must do the following: + * - READ ReferenceTscSequence + * A special '0' value indicates the time source is unreliable and we + * need to use something else. The currently published specification + * versions (up to 4.0b) contain a mistake and wrongly claim '-1' + * instead of '0' as the special value, see commit c35b82ef0294. + * - ReferenceTime = + * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset + * - READ ReferenceTscSequence again. In case its value has changed + * since our first reading we need to discard ReferenceTime and repeat + * the whole sequence as the hypervisor was updating the page in + * between. 
+ */ + do { + sequence = READ_ONCE(tsc_pg->tsc_sequence); + if (!sequence) + return U64_MAX; + /* + * Make sure we read sequence before we read other values from + * TSC page. + */ + smp_rmb(); + + scale = READ_ONCE(tsc_pg->tsc_scale); + offset = READ_ONCE(tsc_pg->tsc_offset); + cur_tsc = rdtsc_ordered(); + + /* + * Make sure we read sequence after we read all other values + * from TSC page. + */ + smp_rmb(); + + } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); + + return mul_u64_u64_shr(cur_tsc, scale, 64) + offset; +} + +#else +static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) +{ + return NULL; +} +#endif #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d8b5f8ab8ef9..673f9ac50f6d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -45,6 +45,8 @@ #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd #define MSR_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 +#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -127,6 +129,7 @@ /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) @@ -552,10 +555,12 @@ #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT) -/* MISC_FEATURE_ENABLES non-architectural features */ -#define MSR_MISC_FEATURE_ENABLES 0x00000140 +/* MISC_FEATURES_ENABLES non-architectural features */ +#define MSR_MISC_FEATURES_ENABLES 0x00000140 -#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT 1 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT) +#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1 #define MSR_IA32_TSC_DEADLINE 0x000006E0 diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b3bebf9e5746..b4a0d43248cf 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -4,6 +4,7 @@ #include <asm/page_64_types.h> #ifndef __ASSEMBLY__ +#include <asm/alternative.h> /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; @@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define pfn_valid(pfn) ((pfn) < max_pfn) #endif -void clear_page(void *page); +void clear_page_orig(void *page); +void clear_page_rep(void *page); +void clear_page_erms(void *page); + +static inline void clear_page(void *page) +{ + alternative_call_2(clear_page_orig, + clear_page_rep, X86_FEATURE_REP_GOOD, + clear_page_erms, X86_FEATURE_ERMS, + "=D" (page), + "0" (page) + : "memory", "rax", "rcx"); +} + void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 9215e0527647..3f5f08b010d0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -36,7 +36,12 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires. 
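A usage sketch for the hv_read_tsc_page() helper added in mshyperv.h above, assuming a clocksource-style read path; example_read_hv_clock() and the fallback example_read_time_ref_count() are hypothetical names, not part of the patch.

#include <asm/mshyperv.h>

static u64 example_read_time_ref_count(void);	/* hypothetical MSR-based fallback */

static u64 example_read_hv_clock(void)
{
	const struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
	u64 now;

	if (tsc_pg) {
		now = hv_read_tsc_page(tsc_pg);
		if (now != U64_MAX)	/* U64_MAX: sequence was 0, page unreliable */
			return now;
	}

	/* No TSC page, or it is flagged unreliable: use another time source. */
	return example_read_time_ref_count();
}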
*/ +#ifdef CONFIG_X86_5LEVEL +#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL) +#else #define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) +#endif + #ifdef CONFIG_RANDOMIZE_MEMORY #define __PAGE_OFFSET page_offset_base #else @@ -46,8 +51,13 @@ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +#ifdef CONFIG_X86_5LEVEL +#define __PHYSICAL_MASK_SHIFT 52 +#define __VIRTUAL_MASK_SHIFT 56 +#else #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 47 +#endif /* * Kernel image size is limited to 1GiB due to the fixmap living in the diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 0489884fdc44..55fa56fe4e45 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn) PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); } +static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) +{ + PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn); +} + +static inline void paravirt_release_p4d(unsigned long pfn) +{ + PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn); +} + static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -536,7 +546,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, val); } -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 static inline pud_t __pud(pudval_t val) { pudval_t ret; @@ -565,26 +575,54 @@ static inline pudval_t pud_val(pud_t pud) return ret; } -static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +static inline void pud_clear(pud_t *pudp) { - pgdval_t val = native_pgd_val(pgd); + set_pud(pudp, __pud(0)); +} - if (sizeof(pgdval_t) > sizeof(long)) - PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp, +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + p4dval_t val = native_p4d_val(p4d); + + if (sizeof(p4dval_t) > sizeof(long)) + PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp, val, (u64)val >> 32); else - PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, + PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp, val); } +#if CONFIG_PGTABLE_LEVELS >= 5 + +static inline p4d_t __p4d(p4dval_t val) +{ + p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val); + + return (p4d_t) { ret }; +} + +static inline p4dval_t p4d_val(p4d_t p4d) +{ + return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d); +} + +static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + pgdval_t val = native_pgd_val(pgd); + + PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val); +} + static inline void pgd_clear(pgd_t *pgdp) { set_pgd(pgdp, __pgd(0)); } -static inline void pud_clear(pud_t *pudp) +#endif /* CONFIG_PGTABLE_LEVELS == 5 */ + +static inline void p4d_clear(p4d_t *p4dp) { - set_pud(pudp, __pud(0)); + set_p4d(p4dp, __p4d(0)); } #endif /* CONFIG_PGTABLE_LEVELS == 4 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b060f962d581..7465d6fe336f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -238,9 +238,11 @@ struct pv_mmu_ops { void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn); void (*release_pte)(unsigned long pfn); void (*release_pmd)(unsigned long pfn); void (*release_pud)(unsigned long pfn); + void (*release_p4d)(unsigned long pfn); /* Pagetable 
manipulation functions */ void (*set_pte)(pte_t *ptep, pte_t pteval); @@ -279,12 +281,21 @@ struct pv_mmu_ops { struct paravirt_callee_save pmd_val; struct paravirt_callee_save make_pmd; -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 struct paravirt_callee_save pud_val; struct paravirt_callee_save make_pud; - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ + void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval); + +#if CONFIG_PGTABLE_LEVELS >= 5 + struct paravirt_callee_save p4d_val; + struct paravirt_callee_save make_p4d; + + void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval); +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ + +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ + #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ struct pv_lazy_ops lazy_mode; diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1411dbed5e5e..f513cc231151 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -7,6 +7,7 @@ #include <linux/string.h> #include <linux/scatterlist.h> #include <asm/io.h> +#include <asm/pat.h> #include <asm/x86_init.h> #ifdef __KERNEL__ @@ -102,10 +103,8 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, - int write_combine); - +#define arch_can_pci_mmap_wc() pat_enabled() +#define ARCH_GENERIC_PCI_MMAP_RESOURCE #ifdef CONFIG_PCI extern void early_quirks(void); diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index d08eacd298c2..9f1b21f372fe 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -4,6 +4,8 @@ * (c) 1999 Martin Mares <mj@ucw.cz> */ +#include <linux/ioport.h> + #undef DEBUG #ifdef DEBUG diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index b6d425999f99..b2d0cd8288aa 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) {} static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_release_pte(unsigned long pfn) {} static inline void paravirt_release_pmd(unsigned long pfn) {} static inline void paravirt_release_pud(unsigned long pfn) {} +static inline void paravirt_release_p4d(unsigned long pfn) {} #endif /* @@ -121,10 +123,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -150,6 +152,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, ___pud_free_tlb(tlb, pud); } +#if CONFIG_PGTABLE_LEVELS > 4 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) +{ + paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); +} + +static inline p4d_t 
*p4d_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + return (p4d_t *)get_zeroed_page(gfp); +} + +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + free_page((unsigned long)p4d); +} + +extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); + +static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long address) +{ + ___p4d_free_tlb(tlb, p4d); +} + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 392576433e77..373ab1de909f 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -7,6 +7,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 72277b1028a5..50d35e3185f5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,12 +121,9 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } -#if !defined(CONFIG_SMP) || (defined(CONFIG_HIGHMEM64G) && \ - defined(CONFIG_PARAVIRT)) static inline void native_pud_clear(pud_t *pudp) { } -#endif static inline void pud_clear(pud_t *pudp) { diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index bcc89625ebe5..b8a4341faafa 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -7,6 +7,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b8c024..f5af95a0c6b8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -2,8 +2,6 @@ #define _ASM_X86_PGTABLE_H #include <asm/page.h> -#include <asm/e820.h> - #include <asm/pgtable_types.h> /* @@ -53,16 +51,24 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) -#ifndef __PAGETABLE_PUD_FOLDED +#ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) #define pgd_clear(pgd) native_pgd_clear(pgd) #endif +#ifndef set_p4d +# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d) +#endif + +#ifndef __PAGETABLE_PUD_FOLDED +#define p4d_clear(p4d) native_p4d_clear(p4d) +#endif + #ifndef set_pud # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif -#ifndef __PAGETABLE_PMD_FOLDED +#ifndef __PAGETABLE_PUD_FOLDED #define pud_clear(pud) native_pud_clear(pud) #endif @@ -74,6 +80,11 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) +#ifndef __PAGETABLE_P4D_FOLDED +#define p4d_val(x) native_p4d_val(x) +#define __p4d(x) native_make_p4d(x) +#endif + #ifndef __PAGETABLE_PUD_FOLDED #define pud_val(x) native_pud_val(x) #define __pud(x) native_make_pud(x) @@ -179,6 +190,17 @@ static inline unsigned long pud_pfn(pud_t pud) return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + return 
(p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; +} + +static inline int p4d_large(p4d_t p4d) +{ + /* No 512 GiB pages yet */ + return 0; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -538,6 +560,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define pte_pgprot(x) __pgprot(pte_flags(x)) #define pmd_pgprot(x) __pgprot(pmd_flags(x)) #define pud_pgprot(x) __pgprot(pud_flags(x)) +#define p4d_pgprot(x) __pgprot(p4d_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) @@ -587,6 +610,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> +#include <asm/fixmap.h> static inline int pte_none(pte_t pte) { @@ -770,7 +794,52 @@ static inline int pud_large(pud_t pud) } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + #if CONFIG_PGTABLE_LEVELS > 3 +static inline int p4d_none(p4d_t p4d) +{ + return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + return p4d_flags(p4d) & _PAGE_PRESENT; +} + +static inline unsigned long p4d_page_vaddr(p4d_t p4d) +{ + return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define p4d_page(p4d) \ + pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) + +/* Find an entry in the third-level page table.. */ +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address); +} + +static inline int p4d_bad(p4d_t p4d) +{ + return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ + +static inline unsigned long p4d_index(unsigned long address) +{ + return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); +} + +#if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { return pgd_flags(pgd) & _PAGE_PRESENT; @@ -788,14 +857,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline unsigned long pud_index(unsigned long address) -{ - return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); -} - -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); + return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } static inline int pgd_bad(pgd_t pgd) @@ -813,7 +877,7 @@ static inline int pgd_none(pgd_t pgd) */ return !native_pgd_val(pgd); } -#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* __ASSEMBLY__ */ @@ -845,6 +909,7 @@ static inline int pgd_none(pgd_t pgd) extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); +extern void memblock_find_dma_reserve(void); #ifdef CONFIG_X86_64 /* Realmode trampoline initialization. 
*/ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index fbc73360aea0..bfab55675c16 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -14,7 +14,6 @@ */ #ifndef __ASSEMBLY__ #include <asm/processor.h> -#include <asm/fixmap.h> #include <linux/threads.h> #include <asm/paravirt.h> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 73c7ccc38912..9991224f6238 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -35,15 +35,22 @@ extern void paging_init(void); #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) + +#if CONFIG_PGTABLE_LEVELS >= 5 +#define p4d_ERROR(e) \ + pr_err("%s:%d: bad p4d %p(%016lx)\n", \ + __FILE__, __LINE__, &(e), p4d_val(e)) +#endif + #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) struct mm_struct; +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); - static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -121,6 +128,20 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } +static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + *p4dp = p4d; +} + +static inline void native_p4d_clear(p4d_t *p4d) +{ +#ifdef CONFIG_X86_5LEVEL + native_set_p4d(p4d, native_make_p4d(0)); +#else + native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)}); +#endif +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { *pgdp = pgd; diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3a264200c62f..06470da156ba 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -13,6 +13,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; @@ -22,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t; #define SHARED_KERNEL_PMD 0 +#ifdef CONFIG_X86_5LEVEL + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 48 +#define PTRS_PER_PGD 512 + +/* + * 4th level page in 5-level paging case + */ +#define P4D_SHIFT 39 +#define PTRS_PER_P4D 512 +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE - 1)) + +#else /* CONFIG_X86_5LEVEL */ + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ #define PGDIR_SHIFT 39 #define PTRS_PER_PGD 512 +#endif /* CONFIG_X86_5LEVEL */ + /* * 3rd level page */ @@ -55,9 +76,15 @@ typedef struct { pteval_t pte; } pte_t; /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
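To make the new p4d level concrete, here is an illustrative walk (not part of the patch) using the helpers introduced above: p4d_offset() now sits between pgd_offset() and pud_offset(), and with CONFIG_X86_5LEVEL the pgd index covers bits 48 and up (PGDIR_SHIFT 48) while the p4d index covers bits 39-47 (P4D_SHIFT 39). On 4-level kernels the p4d level is folded away and the same code compiles to the old walk. Locking and huge pages are ignored; this is a sketch only.

#include <linux/mm.h>

static pte_t *example_lookup_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);		/* new intermediate level */
	if (p4d_none(*p4d) || p4d_bad(*p4d))
		return NULL;

	pud = pud_offset(p4d, addr);		/* now takes a p4d_t *, not a pgd_t * */
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;

	return pte_offset_kernel(pmd, addr);
}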
*/ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#ifdef CONFIG_X86_5LEVEL +#define VMALLOC_SIZE_TB _AC(16384, UL) +#define __VMALLOC_BASE _AC(0xff92000000000000, UL) +#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +#else #define VMALLOC_SIZE_TB _AC(32, UL) #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) #define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +#endif #ifdef CONFIG_RANDOMIZE_MEMORY #define VMALLOC_START vmalloc_base #define VMEMMAP_START vmemmap_base @@ -67,10 +94,11 @@ typedef struct { pteval_t pte; } pte_t; #endif /* CONFIG_RANDOMIZE_MEMORY */ #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) -#define MODULES_END _AC(0xffffffffff000000, UL) +/* The module sections ends with the start of the fixmap */ +#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) -#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) #define EFI_VA_END (-68 * (_AC(1, UL) << 30)) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8b4de22d6429..bf9638e1ee42 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -272,6 +272,27 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } +#if CONFIG_PGTABLE_LEVELS > 4 +typedef struct { p4dval_t p4d; } p4d_t; + +static inline p4d_t native_make_p4d(pudval_t val) +{ + return (p4d_t) { val }; +} + +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return p4d.p4d; +} +#else +#include <asm-generic/pgtable-nop4d.h> + +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return native_pgd_val(p4d.pgd); +} +#endif + #if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; @@ -289,7 +310,7 @@ static inline pudval_t native_pud_val(pud_t pud) static inline pudval_t native_pud_val(pud_t pud) { - return native_pgd_val(pud.pgd); + return native_pgd_val(pud.p4d.pgd); } #endif @@ -310,10 +331,26 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) static inline pmdval_t native_pmd_val(pmd_t pmd) { - return native_pgd_val(pmd.pud.pgd); + return native_pgd_val(pmd.pud.p4d.pgd); } #endif +static inline p4dval_t p4d_pfn_mask(p4d_t p4d) +{ + /* No 512 GiB huge pages yet */ + return PTE_PFN_MASK; +} + +static inline p4dval_t p4d_flags_mask(p4d_t p4d) +{ + return ~p4d_pfn_mask(p4d); +} + +static inline p4dval_t p4d_flags(p4d_t p4d) +{ + return native_p4d_val(p4d) & p4d_flags_mask(p4d); +} + static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) @@ -457,6 +494,7 @@ enum pg_level { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_512G, PG_LEVEL_NUM }; diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 34684adb6899..b3b09b98896d 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -46,6 +46,15 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { + /* + * "Allocated" pkeys are those that have been returned + * from pkey_alloc(). pkey 0 is special, and never + * returned from pkey_alloc(). 
+ */ + if (pkey <= 0) + return false; + if (pkey >= arch_max_pkey()) + return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } @@ -82,12 +91,6 @@ int mm_pkey_alloc(struct mm_struct *mm) static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { - /* - * pkey 0 is special, always allocated and can never - * be freed. - */ - if (!pkey) - return -EINVAL; if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 2c1ebeb4d737..d5a22bac9988 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -44,18 +44,14 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) BUG(); } -static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) -{ - return memcpy_mcsafe(dst, src, n); -} - /** * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) - * instruction. + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. */ static inline void arch_wb_cache_pmem(void *addr, size_t size) { @@ -69,15 +65,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size) clwb(p); } -/* - * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec - * iterators, so for other types (bvec & kvec) we must do a cache write-back. - */ -static inline bool __iter_needs_pmem_wb(struct iov_iter *i) -{ - return iter_is_iovec(i) == false; -} - /** * arch_copy_from_iter_pmem - copy data from an iterator to PMEM * @addr: PMEM destination address @@ -94,7 +81,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, /* TODO: skip the write-back by always using non-temporal stores */ len = copy_from_iter_nocache(addr, bytes, i); - if (__iter_needs_pmem_wb(i)) + /* + * In the iovec case on x86_64 copy_from_iter_nocache() uses + * non-temporal stores for the bulk of the transfer, but we need + * to manually flush if the transfer is unaligned. A cached + * memory copy is used when destination or size is not naturally + * aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. + * + * In the non-iovec case the entire destination needs to be + * flushed. + */ + if (iter_is_iovec(i)) { + unsigned long flushed, dest = (unsigned long) addr; + + if (bytes < 8) { + if (!IS_ALIGNED(dest, 4) || (bytes != 4)) + arch_wb_cache_pmem(addr, 1); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + arch_wb_cache_pmem(addr, 1); + } + + flushed = dest - (unsigned long) addr; + if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) + arch_wb_cache_pmem(addr + bytes - 1, 1); + } + } else arch_wb_cache_pmem(addr, bytes); return len; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f385eca5407a..3cada998a402 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -80,7 +80,7 @@ extern u16 __read_mostly tlb_lld_1g[NR_INFO]; /* * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head.S, so think twice + * Members of this structure are referenced in head_32.S, so think twice * before touching them. 
[mj] */ @@ -89,14 +89,7 @@ struct cpuinfo_x86 { __u8 x86_vendor; /* CPU vendor */ __u8 x86_model; __u8 x86_mask; -#ifdef CONFIG_X86_32 - char wp_works_ok; /* It doesn't on 386's */ - - /* Problems on some 486Dx4's and old 386's: */ - char rfu; - char pad0; - char pad1; -#else +#ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ int x86_tlbsize; #endif @@ -716,6 +709,8 @@ extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); +extern void load_direct_gdt(int); +extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); @@ -797,6 +792,7 @@ static inline void spin_lock_prefetch(const void *x) /* * User space process size: 3GB (default). */ +#define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET #define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE @@ -873,7 +869,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) #define KSTK_EIP(task) (task_pt_regs(task)->ip) @@ -884,6 +881,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +DECLARE_PER_CPU(u64, msr_misc_features_shadow); + /* Register/unregister a process' MPX related resource */ #define MPX_ENABLE_MANAGEMENT() mpx_enable_management() #define MPX_DISABLE_MANAGEMENT() mpx_disable_management() diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 9b9b30b19441..8d3964fc5f91 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -9,6 +9,7 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif #ifdef CONFIG_X86_32 @@ -30,6 +31,7 @@ void x86_report_nx(void); extern int reboot_force; -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h new file mode 100644 index 000000000000..d7da2729903d --- /dev/null +++ b/arch/x86/include/asm/purgatory.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_PURGATORY_H +#define _ASM_X86_PURGATORY_H + +#ifndef __ASSEMBLY__ +#include <linux/purgatory.h> + +extern void purgatory(void); +/* + * These forward declarations serve two purposes: + * + * 1) Make sparse happy when checking arch/purgatory + * 2) Document that these are required to be global so the symbol + * lookup in kexec works + */ +extern unsigned long purgatory_backup_dest; +extern unsigned long purgatory_backup_src; +extern unsigned long purgatory_backup_sz; +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 2cb1cc253d51..fc62ba8dce93 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -15,6 +15,7 @@ struct machine_ops { }; extern struct machine_ops machine_ops; +extern int crashing_cpu; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); diff --git 
a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fac9a5c0abe9..d91ba04dd007 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -53,6 +53,12 @@ # define NEED_MOVBE 0 #endif +#ifdef CONFIG_X86_5LEVEL +# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31)) +#else +# define NEED_LA57 0 +#endif + #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT /* Paravirtualized systems may not have PSE or PGE available */ @@ -98,7 +104,7 @@ #define REQUIRED_MASK13 0 #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 +#define REQUIRED_MASK16 (NEED_LA57) #define REQUIRED_MASK17 0 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h new file mode 100644 index 000000000000..eaec6c364e42 --- /dev/null +++ b/arch/x86/include/asm/set_memory.h @@ -0,0 +1,87 @@ +#ifndef _ASM_X86_SET_MEMORY_H +#define _ASM_X86_SET_MEMORY_H + +#include <asm/page.h> +#include <asm-generic/set_memory.h> + +/* + * The set_memory_* API can be used to change various attributes of a virtual + * address range. The attributes include: + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXeutable, NoteXecutable + * Read/Write : ReadOnly, ReadWrite + * Presence : NotPresent + * + * Within a category, the attributes are mutually exclusive. + * + * The implementation of this API will take care of various aspects that + * are associated with changing such attributes, such as: + * - Flushing TLBs + * - Flushing CPU caches + * - Making sure aliases of the memory behind the mapping don't violate + * coherency rules as defined by the CPU in the system. + * + * What this API does not do: + * - Provide exclusion between various callers - including callers that + * operation on other mappings of the same physical page + * - Restore default attributes when a page is freed + * - Guarantee that mappings other than the requested one are + * in any state, other than that these do not violate rules for + * the CPU you have. Do not depend on any effects on other mappings, + * CPUs other than the one you have may have more relaxed rules. + * The caller is required to take care of these. + */ + +int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); +int _set_memory_wb(unsigned long addr, int numpages); +int set_memory_uc(unsigned long addr, int numpages); +int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); +int set_memory_wb(unsigned long addr, int numpages); +int set_memory_np(unsigned long addr, int numpages); +int set_memory_4k(unsigned long addr, int numpages); + +int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); +int set_memory_array_wb(unsigned long *addr, int addrinarray); + +int set_pages_array_uc(struct page **pages, int addrinarray); +int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); +int set_pages_array_wb(struct page **pages, int addrinarray); + +/* + * For legacy compatibility with the old APIs, a few functions + * are provided that work on a "struct page". 
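As a conversion sketch for the deprecation note in this header (an assumption, not code from the patch): the legacy set_pages_*() path bounced through virt_to_page() and could only ever affect the 1:1 kernel mapping, whereas the preferred form passes the virtual address straight to set_memory_*(). Both calls below are declared above; the wrapper function is hypothetical.

#include <linux/mm.h>
#include <asm/set_memory.h>

static int example_make_buffer_uncached(void *buf, int nr_pages)
{
	/* Old style: look up the struct page only to change attributes;
	 * this silently operated on the 1:1 mapping alone. */
	/* return set_pages_uc(virt_to_page(buf), nr_pages); */

	/* Preferred: operate on the virtual address that is actually used. */
	return set_memory_uc((unsigned long)buf, nr_pages);
}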
+ * These functions operate ONLY on the 1:1 kernel mapping of the + * memory that the struct page represents, and internally just + * call the set_memory_* function. See the description of the + * set_memory_* function for more details on conventions. + * + * These APIs should be considered *deprecated* and are likely going to + * be removed in the future. + * The reason for this is the implicit operation on the 1:1 mapping only, + * making this not a generally useful API. + * + * Specifically, many users of the old APIs had a virtual address, + * called virt_to_page() or vmalloc_to_page() on that address to + * get a struct page* that the old API required. + * To convert these cases, use set_memory_*() on the original + * virtual address, do not use these functions. + */ + +int set_pages_uc(struct page *page, int numpages); +int set_pages_wb(struct page *page, int numpages); +int set_pages_x(struct page *page, int numpages); +int set_pages_nx(struct page *page, int numpages); +int set_pages_ro(struct page *page, int numpages); +int set_pages_rw(struct page *page, int numpages); + +extern int kernel_set_to_readonly; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); + +#endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 026ea82ecc60..47103eca3775 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -149,6 +149,19 @@ void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) #define cpu_acpi_id(cpu) per_cpu(x86_cpu_to_acpiid, cpu) +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. + */ +#define raw_smp_processor_id() (this_cpu_read(cpu_number)) + +#ifdef CONFIG_X86_32 +extern int safe_smp_processor_id(void); +#else +# define safe_smp_processor_id() smp_processor_id() +#endif + #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() static inline int wbinvd_on_all_cpus(void) @@ -161,22 +174,6 @@ static inline int wbinvd_on_all_cpus(void) extern unsigned disabled_cpus; -#ifdef CONFIG_X86_32_SMP -/* - * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. 
- */ -#define raw_smp_processor_id() (this_cpu_read(cpu_number)) -extern int safe_smp_processor_id(void); - -#elif defined(CONFIG_X86_64_SMP) -#define raw_smp_processor_id() (this_cpu_read(cpu_number)) - -#define safe_smp_processor_id() smp_processor_id() - -#endif - #ifdef CONFIG_X86_LOCAL_APIC #ifndef CONFIG_X86_64 @@ -191,11 +188,7 @@ static inline int logical_smp_processor_id(void) extern int hard_smp_processor_id(void); #else /* CONFIG_X86_LOCAL_APIC */ - -# ifndef CONFIG_SMP -# define hard_smp_processor_id() 0 -# endif - +#define hard_smp_processor_id() 0 #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_DEBUG_NMI_SELFTEST diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 4517d6b93188..1f5bee2c202f 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -26,8 +26,13 @@ # endif #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ -# define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 46 +# ifdef CONFIG_X86_5LEVEL +# define MAX_PHYSADDR_BITS 52 +# define MAX_PHYSMEM_BITS 52 +# else +# define MAX_PHYSADDR_BITS 44 +# define MAX_PHYSMEM_BITS 46 +# endif #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 58505f01962f..dcbd9bcce714 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu) { #ifdef CONFIG_X86_32 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); - struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); + struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu); struct desc_struct desc; desc = gdt_table[GDT_ENTRY_STACK_CANARY]; diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index a164862d77e3..733bae07fb29 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -79,6 +79,7 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +#define __HAVE_ARCH_MEMCPY_MCSAFE 1 __must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); DECLARE_STATIC_KEY_FALSE(mcsafe_key); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6f5eb07a95..e00e1bd6e7b3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -73,9 +73,6 @@ struct thread_info { * thread information flags * - these are process state flags that various assembly files * may need to access - * - pending work-to-be-done flags are in LSW - * - other flags in MSW - * Warning: layout of LSW is hardcoded in entry.S */ #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ @@ -87,6 +84,8 @@ struct thread_info { #define TIF_SECCOMP 8 /* secure computing */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ +#define TIF_PATCH_PENDING 13 /* pending live patching update */ +#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ @@ -103,13 +102,15 @@ struct thread_info { #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << 
TIF_SIGPENDING) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) +#define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_NOHZ (1 << TIF_NOHZ) @@ -133,12 +134,14 @@ struct thread_info { /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ - ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ - _TIF_NOHZ) + (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ + _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ + _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) @@ -168,9 +171,9 @@ static inline unsigned long current_stack_pointer(void) * entirely contained by a single stack frame. * * Returns: - * 1 if within a frame - * -1 if placed across a frame boundary (or outside stack) - * 0 unable to determine (no frame pointers, etc) + * GOOD_FRAME if within a frame + * BAD_STACK if placed across a frame boundary (or outside stack) + * NOT_STACK unable to determine (no frame pointers, etc) */ static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, @@ -197,13 +200,14 @@ static inline int arch_within_stack_frames(const void * const stack, * the copy as invalid. */ if (obj + len <= frame) - return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1; + return obj >= oldframe + 2 * sizeof(void *) ? + GOOD_FRAME : BAD_STACK; oldframe = frame; frame = *(const void * const *)frame; } - return -1; + return BAD_STACK; #else - return 0; + return NOT_STACK; #endif } @@ -239,6 +243,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void arch_task_cache_init(void); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); extern void arch_release_task_struct(struct task_struct *tsk); +extern void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index a04eabd43d06..27e9f9d769b8 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -12,6 +12,8 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; +extern bool using_native_sched_clock(void); + /* * We use the full linear equation: f(x) = a + b*x, in order to allow * a continuous function in the face of dynamic freq changes. 
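The timer.h comment above describes keeping the cycles-to-nanoseconds conversion continuous across dynamic frequency changes by using the full linear equation f(x) = a + b*x. The following is a minimal standalone sketch of that idea, not kernel code: the struct and function names are illustrative, 'b' is expressed as the usual fixed-point mult/shift pair, and the kernel's real implementation uses a wider multiply to avoid overflow.

#include <stdint.h>

struct lin_clock {
	uint64_t offset;	/* 'a': nanosecond offset at the last rescale */
	uint32_t mult;		/* 'b' as a fixed-point multiplier            */
	uint32_t shift;		/* fixed-point shift for the multiply         */
};

/* f(x) = a + b*x, with b expressed as mult >> shift */
static uint64_t lin_cycles_to_ns(const struct lin_clock *c, uint64_t cycles)
{
	return c->offset + ((cycles * c->mult) >> c->shift);
}

/*
 * On a frequency change, recompute the offset so that the old and new
 * parameters agree at the switch-over cycle count, i.e. f() stays a
 * continuous function of the cycle counter.
 */
static void lin_rescale(struct lin_clock *c, uint64_t now_cycles,
			uint32_t new_mult, uint32_t new_shift)
{
	uint64_t ns_now = lin_cycles_to_ns(c, now_cycles);

	c->mult   = new_mult;
	c->shift  = new_shift;
	c->offset = ns_now - ((now_cycles * new_mult) >> new_shift);
}
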
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6fa85944af83..6ed9ea469b48 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask) } } +static inline void cr4_toggle_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + cr4 ^= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +} + /* Read the CR4 shadow. */ static inline unsigned long cr4_read_shadow(void) { @@ -188,7 +198,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (static_cpu_has(X86_FEATURE_PGE)) + if (boot_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); @@ -205,7 +215,6 @@ static inline void __flush_tlb_one(unsigned long addr) /* * TLB flushing: * - * - flush_tlb() flushes the current mm struct TLBs * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page @@ -237,11 +246,6 @@ static inline void flush_tlb_all(void) __flush_tlb_all(); } -static inline void flush_tlb(void) -{ - __flush_tlb_up(); -} - static inline void local_flush_tlb(void) { __flush_tlb_up(); @@ -303,14 +307,11 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define flush_tlb() flush_tlb_current_task() - void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ea148313570f..68766b276d9e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -3,19 +3,14 @@ /* * User space memory access functions */ -#include <linux/errno.h> #include <linux/compiler.h> #include <linux/kasan-checks.h> -#include <linux/thread_info.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with @@ -384,6 +379,18 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \ + asm volatile("\n" \ + "1: mov"itype" %2,%"rtype"1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: mov %3,%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "=r" (err), ltype(x) \ + : "m" (__m(addr)), "i" (errret), "0" (err)) + /* * This doesn't do __uaccess_begin/end - the exception handling * around it must do that. 
@@ -675,59 +682,6 @@ extern struct movsl_mask { # include <asm/uaccess_64.h> #endif -unsigned long __must_check _copy_from_user(void *to, const void __user *from, - unsigned n); -unsigned long __must_check _copy_to_user(void __user *to, const void *from, - unsigned n); - -extern void __compiletime_error("usercopy buffer size is too small") -__bad_copy_user(void); - -static inline void copy_user_overflow(int size, unsigned long count) -{ - WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); -} - -static __always_inline unsigned long __must_check -copy_from_user(void *to, const void __user *from, unsigned long n) -{ - int sz = __compiletime_object_size(to); - - might_fault(); - - kasan_check_write(to, n); - - if (likely(sz < 0 || sz >= n)) { - check_object_size(to, n, false); - n = _copy_from_user(to, from, n); - } else if (!__builtin_constant_p(n)) - copy_user_overflow(sz, n); - else - __bad_copy_user(); - - return n; -} - -static __always_inline unsigned long __must_check -copy_to_user(void __user *to, const void *from, unsigned long n) -{ - int sz = __compiletime_object_size(from); - - kasan_check_read(from, n); - - might_fault(); - - if (likely(sz < 0 || sz >= n)) { - check_object_size(from, n, true); - n = _copy_to_user(to, from, n); - } else if (!__builtin_constant_p(n)) - copy_user_overflow(sz, n); - else - __bad_copy_user(); - - return n; -} - /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 7d3bdd1ed697..aeda9bb8af50 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -4,149 +4,52 @@ /* * User space memory access functions */ -#include <linux/errno.h> -#include <linux/thread_info.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> -unsigned long __must_check __copy_to_user_ll - (void __user *to, const void *from, unsigned long n); -unsigned long __must_check __copy_from_user_ll - (void *to, const void __user *from, unsigned long n); -unsigned long __must_check __copy_from_user_ll_nozero - (void *to, const void __user *from, unsigned long n); -unsigned long __must_check __copy_from_user_ll_nocache - (void *to, const void __user *from, unsigned long n); +unsigned long __must_check __copy_user_ll + (void *to, const void *from, unsigned long n); unsigned long __must_check __copy_from_user_ll_nocache_nozero (void *to, const void __user *from, unsigned long n); -/** - * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. - * - * Copy data from kernel space to user space. Caller must check - * the specified block with access_ok() before calling this function. - * The caller should also make sure he pins the user space address - * so that we don't result in page fault and sleep. - */ -static __always_inline unsigned long __must_check -__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) -{ - check_object_size(from, n, true); - return __copy_to_user_ll(to, from, n); -} - -/** - * __copy_to_user: - Copy a block of data into user space, with less checking. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. 
This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. Caller must check - * the specified block with access_ok() before calling this function. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ static __always_inline unsigned long __must_check -__copy_to_user(void __user *to, const void *from, unsigned long n) +raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - might_fault(); - return __copy_to_user_inatomic(to, from, n); + return __copy_user_ll((__force void *)to, from, n); } static __always_inline unsigned long -__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) -{ - return __copy_from_user_ll_nozero(to, from, n); -} - -/** - * __copy_from_user: - Copy a block of data from user space, with less checking. - * @to: Destination address, in kernel space. - * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. Caller must check - * the specified block with access_ok() before calling this function. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. - * - * An alternate version - __copy_from_user_inatomic() - may be called from - * atomic context and will fail rather than sleep. In this case the - * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h - * for explanation of why this is needed. - */ -static __always_inline unsigned long -__copy_from_user(void *to, const void __user *from, unsigned long n) -{ - might_fault(); - check_object_size(to, n, false); - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } - return __copy_from_user_ll(to, from, n); -} - -static __always_inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n) +raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; switch (n) { case 1: + ret = 0; __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); + __get_user_asm_nozero(*(u8 *)to, from, ret, + "b", "b", "=q", 1); __uaccess_end(); return ret; case 2: + ret = 0; __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); + __get_user_asm_nozero(*(u16 *)to, from, ret, + "w", "w", "=r", 2); __uaccess_end(); return ret; case 4: + ret = 0; __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); + __get_user_asm_nozero(*(u32 *)to, from, ret, + "l", "k", "=r", 4); __uaccess_end(); return ret; } } - return __copy_from_user_ll_nocache(to, from, n); + return __copy_user_ll(to, (__force const void *)from, n); } static __always_inline unsigned long diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 673059a109fe..c5504b9a472e 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -5,7 +5,6 @@ * User space memory access 
functions */ #include <linux/compiler.h> -#include <linux/errno.h> #include <linux/lockdep.h> #include <linux/kasan-checks.h> #include <asm/alternative.h> @@ -46,58 +45,54 @@ copy_user_generic(void *to, const void *from, unsigned len) return ret; } -__must_check unsigned long -copy_in_user(void __user *to, const void __user *from, unsigned len); - -static __always_inline __must_check -int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) +static __always_inline __must_check unsigned long +raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { int ret = 0; - check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { case 1: __uaccess_begin(); - __get_user_asm(*(u8 *)dst, (u8 __user *)src, + __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); __uaccess_end(); return ret; case 2: __uaccess_begin(); - __get_user_asm(*(u16 *)dst, (u16 __user *)src, + __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); __uaccess_end(); return ret; case 4: __uaccess_begin(); - __get_user_asm(*(u32 *)dst, (u32 __user *)src, + __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); __uaccess_end(); return ret; case 8: __uaccess_begin(); - __get_user_asm(*(u64 *)dst, (u64 __user *)src, + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); __uaccess_end(); return ret; case 10: __uaccess_begin(); - __get_user_asm(*(u64 *)dst, (u64 __user *)src, + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); if (likely(!ret)) - __get_user_asm(*(u16 *)(8 + (char *)dst), + __get_user_asm_nozero(*(u16 *)(8 + (char *)dst), (u16 __user *)(8 + (char __user *)src), ret, "w", "w", "=r", 2); __uaccess_end(); return ret; case 16: __uaccess_begin(); - __get_user_asm(*(u64 *)dst, (u64 __user *)src, + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); if (likely(!ret)) - __get_user_asm(*(u64 *)(8 + (char *)dst), + __get_user_asm_nozero(*(u64 *)(8 + (char *)dst), (u64 __user *)(8 + (char __user *)src), ret, "q", "", "=r", 8); __uaccess_end(); @@ -107,20 +102,11 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) } } -static __always_inline __must_check -int __copy_from_user(void *dst, const void __user *src, unsigned size) -{ - might_fault(); - kasan_check_write(dst, size); - return __copy_from_user_nocheck(dst, src, size); -} - -static __always_inline __must_check -int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) +static __always_inline __must_check unsigned long +raw_copy_to_user(void __user *dst, const void *src, unsigned long size) { int ret = 0; - check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -176,100 +162,16 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) } static __always_inline __must_check -int __copy_to_user(void __user *dst, const void *src, unsigned size) -{ - might_fault(); - kasan_check_read(src, size); - return __copy_to_user_nocheck(dst, src, size); -} - -static __always_inline __must_check -int __copy_in_user(void __user *dst, const void __user *src, unsigned size) -{ - int ret = 0; - - might_fault(); - if (!__builtin_constant_p(size)) - return copy_user_generic((__force void *)dst, - (__force void *)src, size); - switch (size) { - case 1: { - u8 tmp; - 
__uaccess_begin(); - __get_user_asm(tmp, (u8 __user *)src, - ret, "b", "b", "=q", 1); - if (likely(!ret)) - __put_user_asm(tmp, (u8 __user *)dst, - ret, "b", "b", "iq", 1); - __uaccess_end(); - return ret; - } - case 2: { - u16 tmp; - __uaccess_begin(); - __get_user_asm(tmp, (u16 __user *)src, - ret, "w", "w", "=r", 2); - if (likely(!ret)) - __put_user_asm(tmp, (u16 __user *)dst, - ret, "w", "w", "ir", 2); - __uaccess_end(); - return ret; - } - - case 4: { - u32 tmp; - __uaccess_begin(); - __get_user_asm(tmp, (u32 __user *)src, - ret, "l", "k", "=r", 4); - if (likely(!ret)) - __put_user_asm(tmp, (u32 __user *)dst, - ret, "l", "k", "ir", 4); - __uaccess_end(); - return ret; - } - case 8: { - u64 tmp; - __uaccess_begin(); - __get_user_asm(tmp, (u64 __user *)src, - ret, "q", "", "=r", 8); - if (likely(!ret)) - __put_user_asm(tmp, (u64 __user *)dst, - ret, "q", "", "er", 8); - __uaccess_end(); - return ret; - } - default: - return copy_user_generic((__force void *)dst, - (__force void *)src, size); - } -} - -static __must_check __always_inline int -__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) -{ - kasan_check_write(dst, size); - return __copy_from_user_nocheck(dst, src, size); -} - -static __must_check __always_inline int -__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) +unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) { - kasan_check_read(src, size); - return __copy_to_user_nocheck(dst, src, size); + return copy_user_generic((__force void *)dst, + (__force void *)src, size); } extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); static inline int -__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) -{ - might_fault(); - kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 1); -} - -static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 32712a925f26..1ba1536f627e 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -23,7 +23,6 @@ # include <asm/unistd_64.h> # include <asm/unistd_64_x32.h> # define __ARCH_WANT_COMPAT_SYS_TIME -# define __ARCH_WANT_COMPAT_SYS_GETDENTS64 # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 6fa75b17aec3..e6676495b125 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -11,9 +11,12 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; + bool error; #ifdef CONFIG_FRAME_POINTER + bool got_irq; unsigned long *bp, *orig_sp; struct pt_regs *regs; + unsigned long ip; #else unsigned long *sp; #endif @@ -40,6 +43,11 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } +static inline bool unwind_error(struct unwind_state *state) +{ + return state->error; +} + #ifdef CONFIG_FRAME_POINTER static inline diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 57ab86d94d64..7cac79802ad2 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -185,6 +185,15 @@ #define MSG_REGULAR 1 #define MSG_RETRY 2 +#define BAU_DESC_QUALIFIER 0x534749 + +enum uv_bau_version { + UV_BAU_V1 = 1, + 
UV_BAU_V2, + UV_BAU_V3, + UV_BAU_V4, +}; + /* * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) * If the 'multilevel' flag in the header portion of the descriptor @@ -222,20 +231,32 @@ struct bau_local_cpumask { * the s/w ack bit vector ] */ -/* - * The payload is software-defined for INTD transactions +/** + * struct uv1_2_3_bau_msg_payload - defines payload for INTD transactions + * @address: Signifies a page or all TLB's of the cpu + * @sending_cpu: CPU from which the message originates + * @acknowledge_count: CPUs on the destination Hub that received the interrupt */ -struct bau_msg_payload { - unsigned long address; /* signifies a page or all - TLB's of the cpu */ - /* 64 bits */ - unsigned short sending_cpu; /* filled in by sender */ - /* 16 bits */ - unsigned short acknowledge_count; /* filled in by destination */ - /* 16 bits */ - unsigned int reserved1:32; /* not usable */ +struct uv1_2_3_bau_msg_payload { + u64 address; + u16 sending_cpu; + u16 acknowledge_count; }; +/** + * struct uv4_bau_msg_payload - defines payload for INTD transactions + * @address: Signifies a page or all TLB's of the cpu + * @sending_cpu: CPU from which the message originates + * @acknowledge_count: CPUs on the destination Hub that received the interrupt + * @qualifier: Set by source to verify origin of INTD broadcast + */ +struct uv4_bau_msg_payload { + u64 address; + u16 sending_cpu; + u16 acknowledge_count; + u32 reserved:8; + u32 qualifier:24; +}; /* * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) @@ -385,17 +406,6 @@ struct uv2_3_bau_msg_header { /* bits 127:120 */ }; -/* Abstracted BAU functions */ -struct bau_operations { - unsigned long (*read_l_sw_ack)(void); - unsigned long (*read_g_sw_ack)(int pnode); - unsigned long (*bau_gpa_to_offset)(unsigned long vaddr); - void (*write_l_sw_ack)(unsigned long mmr); - void (*write_g_sw_ack)(int pnode, unsigned long mmr); - void (*write_payload_first)(int pnode, unsigned long mmr); - void (*write_payload_last)(int pnode, unsigned long mmr); -}; - /* * The activation descriptor: * The format of the message to send, plus all accompanying control @@ -411,7 +421,10 @@ struct bau_desc { struct uv2_3_bau_msg_header uv2_3_hdr; } header; - struct bau_msg_payload payload; + union bau_payload_header { + struct uv1_2_3_bau_msg_payload uv1_2_3; + struct uv4_bau_msg_payload uv4; + } payload; }; /* UV1: * -payload-- ---------header------ @@ -588,8 +601,12 @@ struct uvhub_desc { struct socket_desc socket[2]; }; -/* - * one per-cpu; to locate the software tables +/** + * struct bau_control + * @status_mmr: location of status mmr, determined by uvhub_cpu + * @status_index: index of ERR|BUSY bits in status mmr, determined by uvhub_cpu + * + * Per-cpu control struct containing CPU topology information and BAU tuneables. 
*/ struct bau_control { struct bau_desc *descriptor_base; @@ -607,6 +624,8 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; + u64 status_mmr; + int status_index; bool nobau; short baudisabled; short cpu; @@ -644,6 +663,19 @@ struct bau_control { struct hub_and_pnode *thp; }; +/* Abstracted BAU functions */ +struct bau_operations { + unsigned long (*read_l_sw_ack)(void); + unsigned long (*read_g_sw_ack)(int pnode); + unsigned long (*bau_gpa_to_offset)(unsigned long vaddr); + void (*write_l_sw_ack)(unsigned long mmr); + void (*write_g_sw_ack)(int pnode, unsigned long mmr); + void (*write_payload_first)(int pnode, unsigned long mmr); + void (*write_payload_last)(int pnode, unsigned long mmr); + int (*wait_completion)(struct bau_desc*, + struct bau_control*, long try); +}; + static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 72e8300b1e8a..9cffb44a3cf5 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -485,15 +485,17 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; - paddr |= uv_hub_info->gnode_upper; - if (m_val) + + if (m_val) { + paddr |= uv_hub_info->gnode_upper; paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); - else + } else { paddr |= uv_soc_phys_ram_to_nasid(paddr) << uv_hub_info->gpa_shift; + } return paddr; } diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 2444189cbe28..bccdf4938ddf 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -20,6 +20,7 @@ struct vdso_image { long sym_vvar_page; long sym_hpet_page; long sym_pvclock_page; + long sym_hvclock_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cc54b7026567..35cd06f636ab 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,8 +70,10 @@ #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_RDRAND 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_RDSEED 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 @@ -516,12 +518,14 @@ struct vmx_msr_entry { #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 #define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) #define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) #define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) +#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* * VM-instruction error numbers diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 
100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 33cbd3db97b9..8417ef7c3885 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -6,6 +6,7 @@ #include <linux/spinlock.h> #include <linux/pfn.h> #include <linux/mm.h> +#include <linux/device.h> #include <linux/uaccess.h> #include <asm/page.h> @@ -51,12 +52,30 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); +#ifdef CONFIG_XEN_PV extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); +#else +static inline int +set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + return 0; +} + +static inline int +clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, + struct page **pages, unsigned int count) +{ + return 0; +} +#endif /* * Helper functions to write or read unsigned long values to/from @@ -72,6 +91,7 @@ static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val) return __get_user(*val, (unsigned long __user *)addr); } +#ifdef CONFIG_XEN_PV /* * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine(): * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. 
No indicator @@ -98,6 +118,12 @@ static inline unsigned long __pfn_to_mfn(unsigned long pfn) return mfn; } +#else +static inline unsigned long __pfn_to_mfn(unsigned long pfn) +{ + return pfn; +} +#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { @@ -279,13 +305,17 @@ static inline pte_t __pte_ma(pteval_t x) #define pmd_val_ma(v) ((v).pmd) #ifdef __PAGETABLE_PUD_FOLDED -#define pud_val_ma(v) ((v).pgd.pgd) +#define pud_val_ma(v) ((v).p4d.pgd.pgd) #else #define pud_val_ma(v) ((v).pud) #endif #define __pmd_ma(x) ((pmd_t) { (x) } ) -#define pgd_val_ma(x) ((x).pgd) +#ifdef __PAGETABLE_P4D_FOLDED +#define p4d_val_ma(x) ((x).pgd.pgd) +#else +#define p4d_val_ma(x) ((x).p4d) +#endif void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 5138dacf8bb8..ddef37b16af2 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -34,7 +34,6 @@ #include <linux/screen_info.h> #include <linux/apm_bios.h> #include <linux/edd.h> -#include <asm/e820.h> #include <asm/ist.h> #include <video/edid.h> @@ -58,7 +57,7 @@ struct setup_header { __u32 header; __u16 version; __u32 realmode_swtch; - __u16 start_sys; + __u16 start_sys_seg; __u16 kernel_version; __u8 type_of_loader; __u8 loadflags; @@ -111,6 +110,21 @@ struct efi_info { __u32 efi_memmap_hi; }; +/* + * This is the maximum number of entries in struct boot_params::e820_table + * (the zeropage), which is part of the x86 boot protocol ABI: + */ +#define E820_MAX_ENTRIES_ZEROPAGE 128 + +/* + * The E820 memory region entry of the boot protocol ABI: + */ +struct boot_e820_entry { + __u64 addr; + __u64 size; + __u32 type; +} __attribute__((packed)); + /* The so-called "zeropage" */ struct boot_params { struct screen_info screen_info; /* 0x000 */ @@ -153,7 +167,7 @@ struct boot_params { struct setup_header hdr; /* setup header */ /* 0x1f1 */ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ - struct e820entry e820_map[E820MAX]; /* 0x2d0 */ + struct boot_e820_entry e820_table[E820_MAX_ENTRIES_ZEROPAGE]; /* 0x2d0 */ __u8 _pad8[48]; /* 0xcd0 */ struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */ __u8 _pad9[276]; /* 0xeec */ diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 3a20ccf787b8..432df4b1baec 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -124,7 +124,7 @@ * Recommend using hypercall for address space switches rather * than MOV to CR3 instruction */ -#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0) /* Recommend using hypercall for local TLB flushes rather * than INVLPG or MOV to CR3 instructions */ #define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) @@ -148,6 +148,11 @@ #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) /* + * Virtual APIC support + */ +#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) + +/* * Crash notification flag. 
*/ #define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 739c0c594022..c2824d02ba37 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,9 @@ #include <linux/types.h> #include <linux/ioctl.h> +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + #define DE_VECTOR 0 #define DB_VECTOR 1 #define BP_VECTOR 3 diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 835aa51c7f6e..c45765517092 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -1,10 +1,13 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 14458658e988..690a2dcf4078 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -76,7 +76,11 @@ #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 @@ -90,6 +94,7 @@ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVLPG, "INVLPG" }, \ { EXIT_REASON_RDPMC, "RDPMC" }, \ { EXIT_REASON_RDTSC, "RDTSC" }, \ @@ -108,6 +113,8 @@ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ @@ -115,20 +122,24 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ - { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_RDTSCP, "RDTSCP" }, \ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_XSETBV, "XSETBV" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ - { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ - { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ - { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ - { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVVPID, "INVVPID" }, \ + { EXIT_REASON_RDRAND, "RDRAND" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_VMFUNC, "VMFUNC" }, \ + { EXIT_REASON_ENCLS, "ENCLS" }, \ + { EXIT_REASON_RDSEED, "RDSEED" }, \ + { 
EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 84c00592d359..4b994232cb57 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -27,7 +27,7 @@ KASAN_SANITIZE_stacktrace.o := n OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y # If instrumentation of this dir is enabled, boot hangs during first second. @@ -46,7 +46,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o -obj-$(CONFIG_X86_64) += sys_x86_64.o mcount_64.o +obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o @@ -82,6 +82,7 @@ obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_LIVEPATCH) += livepatch.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ae32838cac5f..6bb680671088 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,6 +37,7 @@ #include <linux/pci.h> #include <linux/efi-bgrt.h> +#include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> #include <asm/pgtable.h> @@ -179,10 +180,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -710,7 +716,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -1559,12 +1565,6 @@ int __init early_acpi_boot_init(void) return 0; } -static int __init acpi_parse_bgrt(struct acpi_table_header *table) -{ - efi_bgrt_init(table); - return 0; -} - int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ @@ -1724,6 +1724,6 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820_add_region(addr, size, E820_ACPI); - update_e820(); + e820__range_add(addr, size, E820_TYPE_ACPI); + e820__update_table_print(); } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 48587335ede8..ed014814ea35 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void) #ifdef CONFIG_SMP initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = - (unsigned long)get_cpu_gdt_table(smp_processor_id()); + (unsigned long)get_cpu_gdt_rw(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; diff --git 
a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index df083efe6ee0..815dd63f49d0 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -36,7 +36,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/amd_nb.h> diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 0a2bb1f62e72..ef2859f9fcce 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,7 +21,7 @@ #include <linux/pci.h> #include <linux/bitops.h> #include <linux/suspend.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -306,13 +306,13 @@ void __init early_gart_iommu_check(void) fix = 1; if (gart_fix_e820 && !fix && aper_enabled) { - if (e820_any_mapped(aper_base, aper_base + aper_size, - E820_RAM)) { + if (e820__mapped_any(aper_base, aper_base + aper_size, + E820_TYPE_RAM)) { /* reserve it, so we can reuse it in second kernel */ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", aper_base, aper_base + aper_size - 1); - e820_add_region(aper_base, aper_size, E820_RESERVED); - update_e820(); + e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED); + e820__update_table_print(); } } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4261b3282ad9..2d75faf743f2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -731,8 +731,10 @@ static int __init calibrate_APIC_clock(void) TICK_NSEC, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -778,8 +780,10 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; @@ -1610,24 +1614,15 @@ static inline void try_to_enable_x2apic(int remap_mode) { } static inline void __x2apic_enable(void) { } #endif /* !CONFIG_X86_X2APIC */ -static int __init try_to_enable_IR(void) -{ -#ifdef CONFIG_X86_IO_APIC - if (!x2apic_enabled() && skip_ioapic_setup) { - pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); - return -1; - } -#endif - return irq_remapping_enable(); -} - void __init enable_IR_x2apic(void) { unsigned long flags; int ret, ir_stat; - if (skip_ioapic_setup) + if (skip_ioapic_setup) { + pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return; + } ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) @@ -1645,7 +1640,7 @@ void __init enable_IR_x2apic(void) /* If irq_remapping_prepare() succeeded, try to enable it */ if (ir_stat >= 0) - ir_stat = try_to_enable_IR(); + ir_stat = irq_remapping_enable(); /* ir_stat contains the remap mode or an error code */ try_to_enable_x2apic(ir_stat); @@ -1798,8 +1793,8 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; /* - * acpi lapic path already maps that address in - * acpi_register_lapic_address() + * 
If the system has ACPI MADT tables or MP info, the LAPIC + * address is already registered. */ if (!acpi_lapic && !smp_found_config) register_lapic_address(apic_phys); @@ -2062,17 +2057,17 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "Only %d processors supported." + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " "Processor %d/0x%x and the rest are ignored.\n", - nr_cpu_ids - 1, nr_logical_cpuids, apicid); - return -1; + nr_cpu_ids, nr_logical_cpuids, apicid); + return -EINVAL; } cpuid_to_apicid[nr_logical_cpuids] = apicid; return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2130,11 +2125,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2186,23 +2179,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); @@ -2258,7 +2241,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) static void __init apic_bsp_up_setup(void) { #ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); + apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); #else /* * Hack: In case of kdump, after a crash, kernel might be booting @@ -2648,7 +2631,7 @@ static int __init lapic_insert_resource(void) } /* - * need call insert after e820_reserve_resources() + * need call insert after e820__reserve_resources() * that is using request_resource */ late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b109e4389c92..2262eb6df796 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -26,7 +26,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index c48264e202fd..2e8f7f048f4f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -25,7 +25,7 @@ #include <linux/interrupt.h> #include <asm/acpi.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e9f8f8cdd570..b487b3a01615 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -34,6 +34,7 
@@ #include <asm/uv/bios.h> #include <asm/uv/uv.h> #include <asm/apic.h> +#include <asm/e820/api.h> #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> @@ -1105,7 +1106,8 @@ void __init uv_init_hub_info(struct uv_hub_info_s *hi) node_id.v = uv_read_local_mmr(UVH_NODE_ID); uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; - hi->gnode_upper = (unsigned long)hi->gnode_extra << mn.m_val; + if (mn.m_val) + hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; if (uv_gp_table) { hi->global_mmr_base = uv_gp_table->mmr_base; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5a414545e8a3..446b0d3d4932 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -2352,7 +2352,7 @@ static int __init apm_init(void) * Note we only set APM segments on CPU zero, since we pin the APM * code to that CPU. */ - gdt = get_cpu_gdt_table(0); + gdt = get_cpu_gdt_rw(0); set_desc_base(&gdt[APM_CS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); set_desc_base(&gdt[APM_CS_16 >> 3], diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 35a5d5dca2fa..ee8f11800295 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -16,7 +16,7 @@ #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/cacheflush.h> +# include <asm/set_memory.h> #endif #include "cpu.h" @@ -556,10 +556,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (check_tsc_unstable()) - clear_sched_clock_stable(); - } else { - clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. 
*/ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a44ef52184df..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,7 +17,7 @@ #include <asm/paravirt.h> #include <asm/alternative.h> #include <asm/pgtable.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> void __init check_bugs(void) { diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index adc0ebd8bed0..44207b71fee1 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -3,7 +3,7 @@ #include <linux/sched/clock.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -105,8 +105,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif - - clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b11b38c3b0bd..c8b39870f33e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -88,7 +88,6 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif - clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -449,19 +448,60 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } +/* Setup the fixmap mapping only once per-processor */ +static inline void setup_fixmap_gdt(int cpu) +{ +#ifdef CONFIG_X86_64 + /* On 64-bit systems, we use a read-only fixmap GDT. */ + pgprot_t prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the GDT + * is read-only, that will triple fault. + * + * On Xen PV, the GDT must be read-only because the hypervisor requires + * it. + */ + pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; +#endif + + __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); +} + +/* Load the original GDT from the per-cpu structure */ +void load_direct_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_direct_gdt); + +/* Load a fixmap remapping of the per-cpu GDT */ +void load_fixmap_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_ro(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_fixmap_gdt); + /* * Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. 
*/ void switch_to_new_gdt(int cpu) { - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(cpu); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); + /* Load the original GDT */ + load_direct_gdt(cpu); /* Reload the per-cpu base */ - load_percpu_segment(cpu); } @@ -1077,8 +1117,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); - else - clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1111,7 +1149,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif - init_hypervisor(c); x86_init_rdrand(c); x86_init_cache_qos(c); setup_pku(c); @@ -1529,6 +1566,9 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #else @@ -1584,6 +1624,9 @@ void cpu_init(void) dbg_restore_debug_regs(); fpu__init_cpu(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 0a3bc19de017..a70fd61095f8 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -185,7 +185,6 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } - clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 35691a6b0d32..4fa90006ac68 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,8 +28,11 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { -#ifdef CONFIG_XEN - &x86_hyper_xen, +#ifdef CONFIG_XEN_PV + &x86_hyper_xen_pv, +#endif +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, @@ -60,12 +63,6 @@ detect_hypervisor_vendor(void) pr_info("Hypervisor detected: %s\n", x86_hyper->name); } -void init_hypervisor(struct cpuinfo_x86 *c) -{ - if (x86_hyper && x86_hyper->set_cpu_features) - x86_hyper->set_cpu_features(c); -} - void __init init_hypervisor_platform(void) { @@ -74,8 +71,6 @@ void __init init_hypervisor_platform(void) if (!x86_hyper) return; - init_hypervisor(&boot_cpu_data); - if (x86_hyper->init_platform) x86_hyper->init_platform(); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe0a615a051b..dfa90a3a5145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) return; } - if (ring3mwait_disabled) { - msr_clear_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + if (ring3mwait_disabled) return; - } - - msr_set_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); if (c == &boot_cpu_data) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; @@ -162,10 +158,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (check_tsc_unstable()) - clear_sched_clock_stable(); - } else { - clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -492,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c) init_intel_energy_perf(c); } +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (!rdmsrl_safe(MSR_PLATFORM_INFO, 
&msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -606,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_energy_perf(c); - probe_xeon_phi_r3mwait(c); + init_intel_misc_features(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5a533fefefa0..5b366462f579 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -32,55 +32,98 @@ #include <asm/intel-family.h> #include <asm/intel_rdt.h> +#define MAX_MBA_BW 100u +#define MBA_IS_LINEAR 0x4 + /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * Used to store the max resource name width and max resource data width + * to display the schemata in a tabular format + */ +int max_name_width, max_data_width; + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { { - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 1, - .cbm_idx_offset = 0 + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 0 + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L3CODE", - .domains = domain_init(RDT_RESOURCE_L3CODE), - .msr_base = IA32_L3_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 3, - .cbm_idx_multi = 2, - .cbm_idx_offset = 1 + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", }, { - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), - .msr_base = IA32_L2_CBM_BASE, - .min_cbm_bits = 1, - .cache_level = 2, - .cbm_idx_multi = 1, 
- .cbm_idx_offset = 0 + .name = "MB", + .domains = domain_init(RDT_RESOURCE_MBA), + .msr_base = IA32_MBA_THRTL_BASE, + .msr_update = mba_wrmsr, + .cache_level = 3, + .parse_ctrlval = parse_bw, + .format_str = "%d=%*d", }, }; -static int cbm_idx(struct rdt_resource *r, int closid) +static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) { - return closid * r->cbm_idx_multi + r->cbm_idx_offset; + return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; } /* @@ -118,9 +161,9 @@ static inline bool cache_alloc_hsw_probe(void) return false; r->num_closid = 4; - r->cbm_len = 20; - r->max_cbm = max_cbm; - r->min_cbm_bits = 2; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.min_cbm_bits = 2; r->capable = true; r->enabled = true; @@ -130,16 +173,66 @@ static inline bool cache_alloc_hsw_probe(void) return false; } -static void rdt_get_config(int idx, struct rdt_resource *r) +/* + * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values + * exposed to user interface and the h/w understandable delay values. + * + * The non-linear delay values have the granularity of power of two + * and also the h/w does not guarantee a curve for configured delay + * values vs. actual b/w enforced. + * Hence we need a mapping that is pre calibrated so the user can + * express the memory b/w as a percentage value. + */ +static inline bool rdt_get_mb_table(struct rdt_resource *r) +{ + /* + * There are no Intel SKUs as of now to support non-linear delay. + */ + pr_info("MBA b/w map not implemented for cpu:%d, model:%d", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return false; +} + +static bool rdt_get_mem_config(struct rdt_resource *r) +{ + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->membw.max_delay = eax.split.max_delay + 1; + r->default_ctrl = MAX_MBA_BW; + if (ecx & MBA_IS_LINEAR) { + r->membw.delay_linear = true; + r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; + r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + } else { + if (!rdt_get_mb_table(r)) + return false; + } + r->data_width = 3; + rdt_get_mba_infofile(r); + + r->capable = true; + r->enabled = true; + + return true; +} + +static void rdt_get_cache_config(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; - union cpuid_0x10_1_edx edx; + union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->cbm_len = eax.split.cbm_len + 1; - r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.cbm_len = eax.split.cbm_len + 1; + r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->data_width = (r->cache.cbm_len + 3) / 4; + rdt_get_cache_infofile(r); r->capable = true; r->enabled = true; } @@ -150,8 +243,9 @@ static void rdt_get_cdp_l3_config(int type) struct rdt_resource *r = &rdt_resources_all[type]; r->num_closid = r_l3->num_closid / 2; - r->cbm_len = r_l3->cbm_len; - r->max_cbm = r_l3->max_cbm; + r->cache.cbm_len = r_l3->cache.cbm_len; + r->default_ctrl = r_l3->default_ctrl; + r->data_width = (r->cache.cbm_len + 3) / 4; r->capable = true; /* * By default, CDP is disabled. 
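For the linear case enumerated by CPUID.0x10.3 above, the relationship between the user-visible bandwidth percentage and the value written to the throttling MSRs is a plain subtraction, and both the minimum bandwidth and the granularity fall out of max_delay. A stand-alone sketch of that arithmetic (illustrative only: max_delay = 90 is a made-up example value, bw_to_delay() simply mirrors the linear branch of delay_bw_map() further down in this file, and user requests are separately rounded up to bw_gran by bw_validate() in intel_rdt_schemata.c):

#include <stdio.h>

#define MAX_MBA_BW  100u                /* bandwidth is expressed in percent */

/* Mirror of the linear mapping: the "delay" programmed into the MSR is
 * simply 100 minus the requested bandwidth percentage. */
static unsigned int bw_to_delay(unsigned int bw_percent)
{
        return MAX_MBA_BW - bw_percent;
}

int main(void)
{
        unsigned int max_delay = 90;                     /* hypothetical CPUID-derived value */
        unsigned int min_bw = MAX_MBA_BW - max_delay;    /* == bw_gran in the hunk above     */

        for (unsigned int bw = min_bw; bw <= MAX_MBA_BW; bw += min_bw)
                printf("bandwidth %3u%% -> delay %2u\n", bw, bw_to_delay(bw));
        return 0;
}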
CDP can be enabled by mount parameter @@ -160,33 +254,6 @@ static void rdt_get_cdp_l3_config(int type) r->enabled = false; } -static inline bool get_rdt_resources(void) -{ - bool ret = false; - - if (cache_alloc_hsw_probe()) - return true; - - if (!boot_cpu_has(X86_FEATURE_RDT_A)) - return false; - - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { - rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); - rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); - } - ret = true; - } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { - /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); - ret = true; - } - - return ret; -} - static int get_cache_id(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -200,29 +267,55 @@ static int get_cache_id(int cpu, int level) return -1; } -void rdt_cbm_update(void *arg) +/* + * Map the memory b/w percentage value to delay values + * that can be written to QOS_MSRs. + * There are currently no SKUs which support non linear delay values. + */ +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - struct msr_param *m = (struct msr_param *)arg; + if (r->membw.delay_linear) + return MAX_MBA_BW - bw; + + pr_warn_once("Non Linear delay-bw map not supported but queried\n"); + return r->default_ctrl; +} + +static void +mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + /* Write the delay values for mba. */ + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); +} + +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); +} + +void rdt_ctrl_update(void *arg) +{ + struct msr_param *m = arg; struct rdt_resource *r = m->res; - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) - goto found; + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + r->msr_update(d, m, r); + return; + } } - pr_info_once("cpu %d not found in any domain for resource %s\n", + pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); - - return; - -found: - for (i = m->low; i < m->high; i++) { - int idx = cbm_idx(r, i); - - wrmsrl(r->msr_base + idx, d->cbm[i]); - } } /* @@ -258,6 +351,32 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +{ + struct msr_param m; + u32 *dc; + int i; + + dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + if (!dc) + return -ENOMEM; + + d->ctrl_val = dc; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100 + */ + for (i = 0; i < r->num_closid; i++, dc++) + *dc = r->default_ctrl; + + m.low = 0; + m.high = r->num_closid; + r->msr_update(d, &m, r); + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. 
* @@ -273,7 +392,7 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int i, id = get_cache_id(cpu, r->cache_level); + int id = get_cache_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; @@ -294,22 +413,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; - d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); - if (!d->cbm) { + if (domain_setup_ctrlval(r, d)) { kfree(d); return; } - for (i = 0; i < r->num_closid; i++) { - int idx = cbm_idx(r, i); - - d->cbm[i] = r->max_cbm; - wrmsrl(r->msr_base + idx, d->cbm[i]); - } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); - r->num_domains++; } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -325,8 +435,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - r->num_domains--; - kfree(d->cbm); + kfree(d->ctrl_val); list_del(&d->list); kfree(d); } @@ -374,6 +483,57 @@ static int intel_rdt_offline_cpu(unsigned int cpu) return 0; } +/* + * Choose a width for the resource name and resource data based on the + * resource that has widest name and cbm. + */ +static __init void rdt_init_padding(void) +{ + struct rdt_resource *r; + int cl; + + for_each_capable_rdt_resource(r) { + cl = strlen(r->name); + if (cl > max_name_width) + max_name_width = cl; + + if (r->data_width > max_data_width) + max_data_width = r->data_width; + } +} + +static __init bool get_rdt_resources(void) +{ + bool ret = false; + + if (cache_alloc_hsw_probe()) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) + ret = true; + } + + return ret; +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -382,6 +542,8 @@ static int __init intel_rdt_late_init(void) if (!get_rdt_resources()) return -ENODEV; + rdt_init_padding(); + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rdt/cat:online:", intel_rdt_online_cpu, intel_rdt_offline_cpu); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 0bbe0f3a039f..f5af0cc7eb0d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -28,7 +28,6 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/slab.h> -#include <linux/cpu.h> #include <linux/task_work.h> #include <uapi/linux/magic.h> @@ -175,6 +174,13 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + static int rdtgroup_cpus_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -183,10 +189,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) 
- seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); - else + if (rdtgrp) { + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } else { ret = -ENOENT; + } rdtgroup_kn_unlock(of->kn); return ret; @@ -253,7 +261,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - ret = cpumask_parse(buf, newmask); + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + if (ret) goto unlock; @@ -474,6 +486,14 @@ static struct rftype rdtgroup_base_files[] = { .seq_show = rdtgroup_cpus_show, }, { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + }, + { .name = "tasks", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -495,32 +515,56 @@ static int rdt_num_closids_show(struct kernfs_open_file *of, struct rdt_resource *r = of->kn->parent->priv; seq_printf(seq, "%d\n", r->num_closid); + return 0; +} +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->default_ctrl); return 0; } -static int rdt_cbm_mask_show(struct kernfs_open_file *of, +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%x\n", r->max_cbm); + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + seq_printf(seq, "%u\n", r->membw.min_bw); return 0; } -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, +static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - seq_printf(seq, "%d\n", r->min_cbm_bits); + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", r->membw.delay_linear); return 0; } /* rdtgroup information files for one cache resource. */ -static struct rftype res_info_files[] = { +static struct rftype res_cache_info_files[] = { { .name = "num_closids", .mode = 0444, @@ -531,7 +575,7 @@ static struct rftype res_info_files[] = { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_cbm_mask_show, + .seq_show = rdt_default_ctrl_show, }, { .name = "min_cbm_bits", @@ -541,11 +585,52 @@ static struct rftype res_info_files[] = { }, }; +/* rdtgroup information files for memory bandwidth. 
*/ +static struct rftype res_mba_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + }, +}; + +void rdt_get_mba_infofile(struct rdt_resource *r) +{ + r->info_files = res_mba_info_files; + r->nr_info_files = ARRAY_SIZE(res_mba_info_files); +} + +void rdt_get_cache_infofile(struct rdt_resource *r) +{ + r->info_files = res_cache_info_files; + r->nr_info_files = ARRAY_SIZE(res_cache_info_files); +} + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { struct kernfs_node *kn_subdir; + struct rftype *res_info_files; struct rdt_resource *r; - int ret; + int ret, len; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -564,8 +649,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) ret = rdtgroup_kn_set_ugid(kn_subdir); if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn_subdir, res_info_files, - ARRAY_SIZE(res_info_files)); + + res_info_files = r->info_files; + len = r->nr_info_files; + + ret = rdtgroup_add_files(kn_subdir, res_info_files, len); if (ret) goto out_destroy; kernfs_activate(kn_subdir); @@ -728,7 +816,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { kernfs_unbreak_active_protection(kn); - kernfs_put(kn); + kernfs_put(rdtgrp->kn); kfree(rdtgrp); } else { kernfs_unbreak_active_protection(kn); @@ -781,7 +869,7 @@ out: return dentry; } -static int reset_all_cbms(struct rdt_resource *r) +static int reset_all_ctrls(struct rdt_resource *r) { struct msr_param msr_param; cpumask_var_t cpu_mask; @@ -804,14 +892,14 @@ static int reset_all_cbms(struct rdt_resource *r) cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); for (i = 0; i < r->num_closid; i++) - d->cbm[i] = r->max_cbm; + d->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); free_cpumask_var(cpu_mask); @@ -897,7 +985,7 @@ static void rdt_kill_sb(struct super_block *sb) /*Put everything back to default values. */ for_each_enabled_rdt_resource(r) - reset_all_cbms(r); + reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); static_branch_disable(&rdt_enable_key); diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c index f369cb8db0d5..406d7a6532f9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -29,26 +29,77 @@ #include <asm/intel_rdt.h> /* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. 
+ */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (!r->membw.delay_linear) + return false; + + ret = kstrtoul(buf, 10, &bw); + if (ret) + return false; + + if (bw < r->membw.min_bw || bw > r->default_ctrl) + return false; + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if (!bw_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* * Check whether a cache bit mask is valid. The SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. */ -static bool cbm_validate(unsigned long var, struct rdt_resource *r) +static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) { - unsigned long first_bit, zero_bit; + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return false; - if (var == 0 || var > r->max_cbm) + if (val == 0 || val > r->default_ctrl) return false; - first_bit = find_first_bit(&var, r->cbm_len); - zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) return false; - if ((zero_bit - first_bit) < r->min_cbm_bits) + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) return false; + + *data = val; return true; } @@ -56,17 +107,17 @@ static bool cbm_validate(unsigned long var, struct rdt_resource *r) * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -static int parse_cbm(char *buf, struct rdt_resource *r) +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - int ret; - ret = kstrtoul(buf, 16, &data); - if (ret) - return ret; - if (!cbm_validate(data, r)) + if (d->have_new_ctrl) return -EINVAL; - r->tmp_cbms[r->num_tmp_cbms++] = data; + + if(!cbm_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; return 0; } @@ -74,8 +125,8 @@ static int parse_cbm(char *buf, struct rdt_resource *r) /* * For each domain in this resource we expect to find a series of: * id=mask - * separated by ";". The "id" is in decimal, and must appear in the - * right order. + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. 
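Concretely, a write to the schemata file is one line per enabled resource, e.g. "L3:0=3ff;1=fffff" followed by "MB:0=50;1=100" (domain ids and values here are made up), and with the rewritten parse_line() that follows the id=value pairs may appear in any order as long as each id names an existing domain. A stand-alone sketch of just the string splitting (the kernel additionally trims whitespace and validates each value through parse_cbm()/parse_bw()):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[] = "L3:0=3ff;1=fffff\nMB:0=50;1=100\n";
        char *line, *next = buf;

        /* Split on newlines, then "resource:", then ";"-separated id=value pairs,
         * mirroring rdtgroup_schemata_write() and parse_line() in this hunk. */
        while ((line = strsep(&next, "\n")) && *line) {
                char *resname = strsep(&line, ":");
                char *dom;

                printf("resource %s\n", resname);
                while ((dom = strsep(&line, ";"))) {
                        char *id = strsep(&dom, "=");

                        printf("  domain %s -> value %s\n", id, dom);
                }
        }
        return 0;
}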
*/ static int parse_line(char *line, struct rdt_resource *r) { @@ -83,21 +134,22 @@ static int parse_line(char *line, struct rdt_resource *r) struct rdt_domain *d; unsigned long dom_id; +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) + return -EINVAL; + dom = strim(dom); list_for_each_entry(d, &r->domains, list) { - dom = strsep(&line, ";"); - if (!dom) - return -EINVAL; - id = strsep(&dom, "="); - if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) - return -EINVAL; - if (parse_cbm(dom, r)) - return -EINVAL; + if (d->id == dom_id) { + if (r->parse_ctrlval(dom, r, d)) + return -EINVAL; + goto next; + } } - - /* Any garbage at the end of the line? */ - if (line && line[0]) - return -EINVAL; - return 0; + return -EINVAL; } static int update_domains(struct rdt_resource *r, int closid) @@ -105,7 +157,7 @@ static int update_domains(struct rdt_resource *r, int closid) struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; - int cpu, idx = 0; + int cpu; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -115,30 +167,46 @@ static int update_domains(struct rdt_resource *r, int closid) msr_param.res = r; list_for_each_entry(d, &r->domains, list) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->ctrl_val[closid] = d->new_ctrl; + } } + if (cpumask_empty(cpu_mask)) + goto done; cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_cbm_update(&msr_param); + rdt_ctrl_update(&msr_param); /* Update CBM on other cpus. */ - smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); +done: free_cpumask_var(cpu_mask); return 0; } +static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +{ + struct rdt_resource *r; + + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && closid < r->num_closid) + return parse_line(tok, r); + } + return -EINVAL; +} + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + struct rdt_domain *dom; struct rdt_resource *r; char *tok, *resname; int closid, ret = 0; - u32 *l3_cbms = NULL; /* Valid input requires a trailing newline */ if (nbytes == 0 || buf[nbytes - 1] != '\n') @@ -153,44 +221,20 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - /* get scratch space to save all the masks while we validate input */ for_each_enabled_rdt_resource(r) { - r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), - GFP_KERNEL); - if (!r->tmp_cbms) { - ret = -ENOMEM; - goto out; - } - r->num_tmp_cbms = 0; + list_for_each_entry(dom, &r->domains, list) + dom->have_new_ctrl = false; } while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strsep(&tok, ":"); + resname = strim(strsep(&tok, ":")); if (!tok) { ret = -EINVAL; goto out; } - for_each_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && - closid < r->num_closid) { - ret = parse_line(tok, r); - if (ret) - goto out; - break; - } - } - if (!r->name) { - ret = -EINVAL; - goto out; - } - } - - /* Did the parser find all the masks we need? 
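The cbm_validate() helper above enforces the SDM rule quoted in its comment: the hex mask must be non-zero, fit within cbm_len bits, form a single contiguous run of 1s, and that run must be at least min_cbm_bits wide. A user-space rendition (the kernel walks the bits with find_first_bit()/find_next_zero_bit(); the bit tricks below are just a compact equivalent, and the 20/2 parameters in main() are the Haswell-probe values from the earlier intel_rdt.c hunk):

#include <stdbool.h>
#include <stdio.h>

static bool cbm_ok(unsigned long val, unsigned int cbm_len, unsigned int min_bits)
{
        unsigned long full_mask = (1UL << cbm_len) - 1;

        if (val == 0 || val > full_mask)
                return false;
        if ((unsigned int)__builtin_popcountl(val) < min_bits)
                return false;

        val >>= __builtin_ctzl(val);            /* strip trailing zeros      */
        return (val & (val + 1)) == 0;          /* one contiguous run of 1s? */
}

int main(void)
{
        printf("%d %d %d\n",
               cbm_ok(0x0ff00, 20, 2),          /* 1: contiguous             */
               cbm_ok(0x0f0f0, 20, 2),          /* 0: two separate runs      */
               cbm_ok(0x00001, 20, 2));         /* 0: run shorter than 2     */
        return 0;
}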
*/ - for_each_enabled_rdt_resource(r) { - if (r->num_tmp_cbms != r->num_domains) { - ret = -EINVAL; + ret = rdtgroup_parse_resource(resname, tok, closid); + if (ret) goto out; - } } for_each_enabled_rdt_resource(r) { @@ -201,10 +245,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); - for_each_enabled_rdt_resource(r) { - kfree(r->tmp_cbms); - r->tmp_cbms = NULL; - } return ret ?: nbytes; } @@ -213,11 +253,12 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) struct rdt_domain *dom; bool sep = false; - seq_printf(s, "%s:", r->name); + seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(dom, &r->domains, list) { if (sep) seq_puts(s, ";"); - seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + seq_printf(s, r->format_str, dom->id, max_data_width, + dom->ctrl_val[closid]); sep = true; } seq_puts(s, "\n"); diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index a3311c886194..43051f0777d4 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c new file mode 100644 index 000000000000..9c632cb88546 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -0,0 +1,397 @@ +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/poll.h> + +#include "mce-internal.h" + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +#define mce_log_get_idx_check(p) \ +({ \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ + smp_load_acquire(&(p)); \ +}) + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +/* User mode helper program triggered by machine check event */ +extern char mce_helper[128]; + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int next, entry; + + wmb(); + for (;;) { + entry = mce_log_get_idx_check(mcelog.next); + for (;;) { + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); + return NOTIFY_OK; + } + /* Old left over entry. 
Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + cpu_tsc[smp_processor_id()] = rdtsc(); +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. 
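dev_mce_log() above keeps the old lockless scheme: writers race with cmpxchg() to claim the next slot of the fixed-size buffer, drop new records once it is full, and only set the per-entry finished flag (behind a write barrier) after the record has been copied in, which is what the reader in mce_chrdev_read() polls for. A minimal user-space sketch of that claim-then-publish pattern using C11 atomics (LOG_LEN and struct record are illustrative stand-ins, not kernel types, and the kernel version additionally skips stale already-finished entries):

#include <stdatomic.h>
#include <stdbool.h>
#include <string.h>

#define LOG_LEN 32

struct record {
        atomic_bool finished;
        char data[56];
};

static struct record slots[LOG_LEN];
static atomic_uint next_slot;

static bool log_record(const char *data, size_t len)
{
        unsigned int entry;

        do {
                entry = atomic_load_explicit(&next_slot, memory_order_acquire);
                if (entry >= LOG_LEN)
                        return false;   /* full: discard new entries, keep old */
        } while (!atomic_compare_exchange_weak(&next_slot, &entry, entry + 1));

        if (len > sizeof(slots[entry].data))
                len = sizeof(slots[entry].data);
        memcpy(slots[entry].data, data, len);

        /* publish: readers only trust a slot once finished is set */
        atomic_store_explicit(&slots[entry].finished, true, memory_order_release);
        return true;
}

int main(void)
{
        return log_record("hello", 6) ? 0 : 1;
}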
+ */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + next = mce_log_get_idx_check(mcelog.next); + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(m, 0, sizeof(*m)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); + smp_rmb(); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); + } + } + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + kfree(cpu_tsc); + + return err ? 
err : buf - ubuf; +} + +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog.next)) + return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int err; + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + return err; + } + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 1e5a50c11d3c..217cd4449bc9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 903043e6a62b..654ad0668d72 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,7 +13,7 @@ enum severity_level { MCE_PANIC_SEVERITY, }; -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern struct blocking_notifier_head x86_mce_decoder_chain; #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) m1->addr != m2->addr || m1->misc != m2->misc; } + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +extern void mce_work_trigger(void); +#else +static inline void mce_work_trigger(void) { } +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 
8e9725c607ea..5abd4bf73d6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -35,6 +35,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/cpu.h> +#include <linux/ras.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> @@ -49,18 +50,11 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/reboot.h> #include "mce-internal.h" -static DEFINE_MUTEX(mce_chrdev_read_mutex); - -#define mce_log_get_idx_check(p) \ -({ \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ - !lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious mce_log_get_idx_check() usage"); \ - smp_load_acquire(&(p)); \ -}) +static DEFINE_MUTEX(mce_log_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -85,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -/* User mode helper program triggered by machine check event */ -static unsigned long mce_need_notify; -static char mce_helper[128]; -static char *mce_helper_argv[2] = { mce_helper, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); - static DEFINE_PER_CPU(struct mce, mces_seen); -static int cpu_missing; +static unsigned long mce_need_notify; +static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -121,7 +109,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) @@ -143,82 +131,38 @@ void mce_setup(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; - -void mce_log(struct mce *mce) +void mce_log(struct mce *m) { - unsigned next, entry; - - /* Emit the trace record: */ - trace_mce_record(mce); - - if (!mce_gen_pool_add(mce)) + if (!mce_gen_pool_add(m)) irq_work_queue(&mce_irq_work); - - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, &mce_need_notify); } void mce_inject_log(struct mce *m) { - mutex_lock(&mce_chrdev_read_mutex); + mutex_lock(&mce_log_mutex); mce_log(m); - mutex_unlock(&mce_chrdev_read_mutex); + mutex_unlock(&mce_log_mutex); } EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +/* + * We run the default notifier if we have only the SRAO, the first and the + * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS + * notifiers registered on the chain. 
+ */ +#define NUM_DEFAULT_NOTIFIERS 3 static atomic_t num_notifiers; void mce_register_decode_chain(struct notifier_block *nb) { - atomic_inc(&num_notifiers); + if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + return; - WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); + atomic_inc(&num_notifiers); - atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -226,7 +170,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb) { atomic_dec(&num_notifiers); - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -319,18 +263,7 @@ static void __print_mce(struct mce *m) static void print_mce(struct mce *m) { - int ret = 0; - __print_mce(m); - - /* - * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that) - */ - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - if (ret == NOTIFY_STOP) - return; - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } @@ -519,7 +452,6 @@ static void mce_schedule_work(void) static void mce_irq_work_cb(struct irq_work *entry) { - mce_notify_irq(); mce_schedule_work(); } @@ -548,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs) */ static int mce_usable_address(struct mce *m) { - if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + if (!(m->status & MCI_STATUS_ADDRV)) return 0; /* Checks after this one are Intel-specific: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 1; + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; + return 1; } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + +static bool cec_add_mce(struct mce *m) +{ + if (!m) + return false; + + /* We eat only correctable DRAM errors with usable addresses. 
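The Intel branch of memory_error() above turns the SDM prose in its comment into three mask comparisons: keeping bit 11 inside the masks means a bus/interconnect error never classifies as a memory error, while bit 12 (the "filter" bit) is masked out entirely. A user-space mirror with a couple of synthetic status values (the values in main() are made up for illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT(n)  (1UL << (n))

static bool intel_mcacod_is_memory_error(uint64_t status)
{
        return (status & 0xef80) == BIT(7) ||   /* memory error (bit 7)      */
               (status & 0xef00) == BIT(8) ||   /* cache hierarchy error     */
               (status & 0xeffc) == 0xc;        /* generic cache hierarchy   */
}

int main(void)
{
        printf("%d\n", (int)intel_mcacod_is_memory_error(0x0080));  /* 1                     */
        printf("%d\n", (int)intel_mcacod_is_memory_error(0x0880));  /* 0: bit 11 = bus error */
        return 0;
}

cec_add_mce() just below then only hands the page frame number to the CEC when such an error is also correctable (MCI_STATUS_UC clear) and carries a usable address.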
*/ + if (memory_error(m) && + !(m->status & MCI_STATUS_UC) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return true; + + return false; +} + +static int mce_first_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (cec_add_mce(m)) + return NOTIFY_STOP; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block first_nb = { + .notifier_call = mce_first_notifier, + .priority = MCE_PRIO_FIRST, +}; + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -591,11 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - /* - * Run the default notifier if we have only the SRAO - * notifier and us registered. - */ - if (atomic_read(&num_notifiers) > 2) + if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) return NOTIFY_DONE; __print_mce(m); @@ -648,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; - - return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * Intel SDM Volume 3B - 15.9.2 Compound Error Codes - * - * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for - * indicating a memory error. Bit 8 is used for indicating a - * cache hierarchy error. The combination of bit 2 and bit 3 - * is used for indicating a `generic' cache hierarchy error - * But we can't just blindly check the above bits, because if - * bit 11 is set, then it is a bus/interconnect error - and - * either way the above bits just gives more detail on what - * bus/interconnect error happened. Note that bit 12 can be - * ignored, as it's the "filter" bit. - */ - return (m->status & 0xef80) == BIT(7) || - (m->status & 0xef00) == BIT(8) || - (m->status & 0xeffc) == 0xc; - } - - return false; -} - DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -1127,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) * on Intel. */ int lmce = 1; + int cpu = smp_processor_id(); - /* If this CPU is offline, just bail out. */ - if (cpu_is_offline(smp_processor_id())) { + /* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); @@ -1399,13 +1386,6 @@ static void mce_timer_delete_all(void) del_timer_sync(&per_cpu(mce_timer, cpu)); } -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - /* * Notify the user(s) about new machine check events. 
* Can be called from interrupt context, but not from machine check/NMI @@ -1417,11 +1397,7 @@ int mce_notify_irq(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); if (test_and_clear_bit(0, &mce_need_notify)) { - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - if (mce_helper[0]) - schedule_work(&mce_trigger_work); + mce_work_trigger(); if (__ratelimit(&ratelimit)) pr_info(HW_ERR "Machine check events logged\n"); @@ -1688,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return 0; } -static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) { - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { + if (c->x86_vendor == X86_VENDOR_AMD) { mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - /* - * Install proper ops for Scalable MCA enabled processors - */ if (mce_flags.smca) { msr_ops.ctl = smca_ctl_reg; msr_ops.status = smca_status_reg; msr_ops.addr = smca_addr_reg; msr_ops.misc = smca_misc_reg; } - mce_amd_feature_init(c); + } +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); break; } @@ -1798,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; + __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); @@ -1823,252 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c) } -/* - * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_chrdev_state_lock); -static int mce_chrdev_open_count; /* #times opened */ -static int mce_chrdev_open_exclu; /* already open exclusive? */ - -static int mce_chrdev_open(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - if (mce_chrdev_open_exclu || - (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_chrdev_state_lock); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - mce_chrdev_open_exclu = 1; - mce_chrdev_open_count++; - - spin_unlock(&mce_chrdev_state_lock); - - return nonseekable_open(inode, file); -} - -static int mce_chrdev_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - mce_chrdev_open_count--; - mce_chrdev_open_exclu = 0; - - spin_unlock(&mce_chrdev_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - cpu_tsc[smp_processor_id()] = rdtsc(); -} - -static int mce_apei_read_done; - -/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ -static int __mce_read_apei(char __user **ubuf, size_t usize) -{ - int rc; - u64 record_id; - struct mce m; - - if (usize < sizeof(struct mce)) - return -EINVAL; - - rc = apei_read_mce(&m, &record_id); - /* Error or no more MCE record */ - if (rc <= 0) { - mce_apei_read_done = 1; - /* - * When ERST is disabled, mce_chrdev_read() should return - * "no record" instead of "no device." 
- */ - if (rc == -ENODEV) - return 0; - return rc; - } - rc = -EFAULT; - if (copy_to_user(*ubuf, &m, sizeof(struct mce))) - return rc; - /* - * In fact, we should have cleared the record after that has - * been flushed to the disk or sent to network in - * /sbin/mcelog, but we have no interface to support that now, - * so just clear it to avoid duplication. - */ - rc = apei_clear_mce(record_id); - if (rc) { - mce_apei_read_done = 1; - return rc; - } - *ubuf += sizeof(struct mce); - - return 0; -} - -static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, - size_t usize, loff_t *off) -{ - char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_chrdev_read_mutex); - - if (!mce_apei_read_done) { - err = __mce_read_apei(&buf, usize); - if (err || buf != ubuf) - goto out; - } - - next = mce_log_get_idx_check(mcelog.next); - - /* Only supports full reads right now */ - err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) - goto out; - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - struct mce *m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(m, 0, sizeof(*m)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - - if (m->finished && m->tsc < cpu_tsc[m->cpu]) { - err |= copy_to_user(buf, m, sizeof(*m)); - smp_rmb(); - buf += sizeof(*m); - memset(m, 0, sizeof(*m)); - } - } - - if (err) - err = -EFAULT; - -out: - mutex_unlock(&mce_chrdev_read_mutex); - kfree(cpu_tsc); - - return err ? 
err : buf - ubuf; -} - -static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) - return POLLIN | POLLRDNORM; - if (!mce_apei_read_done && apei_check_mce()) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, - unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); - -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) -{ - mce_write = fn; -} -EXPORT_SYMBOL_GPL(register_mce_write_callback); - -static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) -{ - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else - return -EINVAL; -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_chrdev_open, - .release = mce_chrdev_release, - .read = mce_chrdev_read, - .write = mce_chrdev_write, - .poll = mce_chrdev_poll, - .unlocked_ioctl = mce_chrdev_ioctl, - .llseek = no_llseek, -}; - -static struct miscdevice mce_chrdev_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - static void __mce_disable_bank(void *arg) { int bank = *((int *)arg); @@ -2142,6 +1878,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&first_nb); mce_register_decode_chain(&mce_srao_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); @@ -2286,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return size; } -static ssize_t -show_trigger(struct device *s, struct device_attribute *attr, char *buf) -{ - strcpy(buf, mce_helper); - strcat(buf, "\n"); - return strlen(mce_helper) + 1; -} - -static ssize_t set_trigger(struct device *s, struct device_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; - p = strchr(mce_helper, '\n'); - - if (p) - *p = 0; - - return strlen(mce_helper) + !!p; -} - static ssize_t set_ignore_ce(struct device *s, struct device_attribute *attr, const char *buf, size_t size) @@ -2365,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s, return ret; } -static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); @@ -2388,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY &dev_attr_trigger, +#endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, @@ -2562,7 +2277,6 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { - enum 
cpuhp_state hp_online; int err; if (!mce_available(&boot_cpu_data)) { @@ -2590,21 +2304,11 @@ static __init int mcheck_init_device(void) mce_cpu_online, mce_cpu_pre_down); if (err < 0) goto err_out_online; - hp_online = err; register_syscore_ops(&mce_syscore_ops); - /* register character device /dev/mcelog */ - err = misc_register(&mce_chrdev_device); - if (err) - goto err_register; - return 0; -err_register: - unregister_syscore_ops(&mce_syscore_ops); - cpuhp_remove_state(hp_online); - err_out_online: cpuhp_remove_state(CPUHP_X86_MCE_DEAD); @@ -2612,7 +2316,7 @@ err_out_mem: free_cpumask_var(mce_device_initialized); err_out: - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + pr_err("Unable to init MCE device (rc: %d)\n", err); return err; } @@ -2691,6 +2395,7 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); + cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 524cc5780a77..6e4a047e4b68 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -60,7 +60,7 @@ static const char * const th_names[] = { "load_store", "insn_fetch", "combined_unit", - "", + "decode_unit", "northbridge", "execution_unit", }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 190b3e6cef4d..e84db79ef272 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_XEON_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) return; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index b5375b9497b3..04cb8d34ccb8 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -49,6 +49,9 @@ void hyperv_vector_handler(struct pt_regs *regs) if (vmbus_handler) vmbus_handler(); + if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) + ack_APIC_irq(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 3b442b64c72d..765afd599039 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -27,7 +27,7 @@ #include <linux/range.h> #include <asm/processor.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -860,7 +860,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); + return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED); } /** @@ -978,7 +978,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) WARN_ON(1); pr_info("update e820 for mtrr\n"); - update_e820(); + e820__update_table_print(); return 1; } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 24e87e74990d..2bce84d91c2b 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -48,7 +48,7 @@ #include <linux/syscore_ops.h> #include <asm/cpufeature.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/pat.h> diff --git a/arch/x86/kernel/cpu/proc.c 
b/arch/x86/kernel/cpu/proc.c index 18ca99f2798b..6df621ae62a7 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -31,14 +31,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" - "wp\t\t: %s\n", + "wp\t\t: yes\n", static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - c->cpuid_level, - c->wp_works_ok ? "yes" : "no"); + c->cpuid_level); } #else static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d9794060fe22..23c23508c012 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 8457b4978668..d77d07ab310b 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -16,8 +16,6 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } - - clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 891f4dad7b2c..40ed26852ebd 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -30,7 +30,6 @@ #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> -#include <asm/timer.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt @@ -114,6 +113,24 @@ static void __init vmware_paravirt_ops_setup(void) #define vmware_paravirt_ops_setup() do {} while (0) #endif +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. 
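The wrap concern spelled out in the comment above is easy to quantify: the ACPI PM timer is a 24-bit (sometimes 32-bit) counter clocked at 3.579545 MHz, so a vCPU descheduled for only a few seconds can sleep across a full rollover. A small stand-alone sketch of that arithmetic (the constants are the standard ACPI ones, not taken from this patch):

#include <stdio.h>

int main(void)
{
        const double pm_timer_hz = 3579545.0;          /* ACPI PM timer frequency */
        const double wrap24 = (1ull << 24) / pm_timer_hz;
        const double wrap32 = (1ull << 32) / pm_timer_hz;

        /* A guest descheduled longer than this can miss a full wrap: */
        printf("24-bit PM timer wraps every %.2f s\n", wrap24);  /* ~4.69 s  */
        printf("32-bit PM timer wraps every %.1f s\n", wrap32);  /* ~20 min  */
        return 0;
}

That is why the hunk below simply forces X86_FEATURE_TSC_RELIABLE once at platform-setup time instead of re-deriving it for each CPU.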
+ */ +static void __init vmware_set_capabilities(void) +{ + setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); +} + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -153,6 +170,8 @@ static void __init vmware_platform_setup(void) #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif + + vmware_set_capabilities(); } /* @@ -177,24 +196,6 @@ static uint32_t __init vmware_platform(void) return 0; } -/* - * VMware hypervisor takes care of exporting a reliable TSC to the guest. - * Still, due to timing difference when running on virtual cpus, the TSC can - * be marked as unstable in some cases. For example, the TSC sync check at - * bootup can fail due to a marginal offset between vcpus' TSCs (though the - * TSCs do not drift from each other). Also, the ACPI PM timer clocksource - * is not suitable as a watchdog when running on a hypervisor because the - * kernel may miss a wrap of the counter if the vcpu is descheduled for a - * long time. To skip these checks at runtime we set these capability bits, - * so that the kernel could just trust the hypervisor with providing a - * reliable virtual TSC that is suitable for timekeeping. - */ -static void vmware_set_cpu_features(struct cpuinfo_x86 *c) -{ - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); -} - /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ static bool __init vmware_legacy_x2apic_available(void) { @@ -207,7 +208,6 @@ static bool __init vmware_legacy_x2apic_available(void) const __refconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, - .set_cpu_features = vmware_set_cpu_features, .init_platform = vmware_platform_setup, .x2apic_available = vmware_legacy_x2apic_available, }; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 3741461c63a0..22217ece26c8 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,6 +29,7 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/e820/types.h> #include <asm/io_apic.h> #include <asm/hpet.h> #include <linux/kdebug.h> @@ -503,16 +504,16 @@ static int prepare_elf_headers(struct kimage *image, void **addr, return ret; } -static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) { unsigned int nr_e820_entries; nr_e820_entries = params->e820_entries; - if (nr_e820_entries >= E820MAX) + if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(&params->e820_map[nr_e820_entries], entry, - sizeof(struct e820entry)); + memcpy(&params->e820_table[nr_e820_entries], entry, + sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -521,7 +522,7 @@ static int memmap_entry_callback(u64 start, u64 end, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; - struct e820entry ei; + struct e820_entry ei; ei.addr = start; ei.size = end - start + 1; @@ -560,7 +561,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) { int i, ret = 0; unsigned long flags; - struct e820entry ei; + struct e820_entry ei; struct crash_memmap_data cmd; struct crash_mem *cmem; @@ -574,17 +575,17 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add first 640K segment */ ei.addr = image->arch.backup_src_start; ei.size = image->arch.backup_src_sz; - ei.type = E820_RAM; + ei.type = 
E820_TYPE_RAM; add_e820_entry(params, &ei); /* Add ACPI tables */ - cmd.type = E820_ACPI; + cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ - cmd.type = E820_NVS; + cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); @@ -592,7 +593,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (crashk_low_res.end) { ei.addr = crashk_low_res.start; ei.size = crashk_low_res.end - crashk_low_res.start + 1; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } @@ -609,7 +610,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) if (ei.size < PAGE_SIZE) continue; ei.addr = cmem->ranges[i].start; - ei.type = E820_RAM; + ei.type = E820_TYPE_RAM; add_e820_entry(params, &ei); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 09d4ac0d2661..dbce3cca94cb 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -77,7 +77,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (regs = NULL; stack; stack = stack_info.next_sp) { + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; /* @@ -289,9 +289,6 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode(regs)) - report_bug(regs->ip, regs); - if (__die(str, regs, err)) sig = 0; oops_end(flags, regs, sig); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b0b3a3df7c20..e5f0b40e66d2 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -162,15 +162,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a8b117e93b46..3e1471d57487 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -178,13 +178,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b2bbad6ebe4d..d78a586ba8dc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1,49 +1,55 @@ /* - * Handle the memory map. - * The functions here do the job until bootmem takes over. + * Low level x86 E820 memory map handling functions. * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * The firmware and bootloader passes us the "E820 table", which is the primary + * physical memory layout description available about x86 systems. 
* + * The kernel takes the E820 memory layout and optionally modifies it with + * quirks and other tweaks, and feeds that into the generic Linux memory + * allocation code routines via a platform independent interface (memblock, etc.). */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> #include <linux/crash_dump.h> -#include <linux/export.h> #include <linux/bootmem.h> -#include <linux/pfn.h> #include <linux/suspend.h> #include <linux/acpi.h> #include <linux/firmware-map.h> #include <linux/memblock.h> #include <linux/sort.h> -#include <asm/e820.h> -#include <asm/proto.h> +#include <asm/e820/api.h> #include <asm/setup.h> -#include <asm/cpufeature.h> /* - * The e820 map is the map that gets modified e.g. with command line parameters - * and that is also registered with modifications in the kernel resource tree - * with the iomem_resource as parent. + * We organize the E820 table into two main data structures: * - * The e820_saved is directly saved after the BIOS-provided memory map is - * copied. It doesn't get modified afterwards. It's registered for the - * /sys/firmware/memmap interface. + * - 'e820_table_firmware': the original firmware version passed to us by the + * bootloader - not modified by the kernel. We use this to: * - * That memory map is not modified and is used as base for kexec. The kexec'd - * kernel should get the same memory map as the firmware provides. Then the - * user can e.g. boot the original kernel with mem=1G while still booting the - * next kernel with full memory. + * - inform the user about the firmware's notion of memory layout + * via /sys/firmware/memmap + * + * - the hibernation code uses it to generate a kernel-independent MD5 + * fingerprint of the physical memory layout of a system. + * + * - kexec, which is a bootloader in disguise, uses the original E820 + * layout to pass to the kexec-ed kernel. This way the original kernel + * can have a restricted E820 map while the kexec()-ed kexec-kernel + * can have access to full memory - etc. + * + * - 'e820_table': this is the main E820 table that is massaged by the + * low level x86 platform code, or modified by boot parameters, before + * passed on to higher level MM layers. + * + * Once the E820 map has been converted to the standard Linux memory layout + * information its role stops - modifying it has no effect and does not get + * re-propagated. So its main role is a temporary bootstrap storage of firmware + * specific memory layout data during early bootup. */ -static struct e820map initial_e820 __initdata; -static struct e820map initial_e820_saved __initdata; -struct e820map *e820 __refdata = &initial_e820; -struct e820map *e820_saved __refdata = &initial_e820_saved; +static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_firmware_init __initdata; + +struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -55,51 +61,53 @@ EXPORT_SYMBOL(pci_mem_start); * This function checks if any part of the range <start,end> is mapped * with type. 
*/ -int -e820_any_mapped(u64 start, u64 end, unsigned type) +bool e820__mapped_any(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - if (ei->addr >= end || ei->addr + ei->size <= start) + if (entry->addr >= end || entry->addr + entry->size <= start) continue; return 1; } return 0; } -EXPORT_SYMBOL_GPL(e820_any_mapped); +EXPORT_SYMBOL_GPL(e820__mapped_any); /* - * This function checks if the entire range <start,end> is mapped with type. + * This function checks if the entire <start,end> range is mapped with 'type'. * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case + * Note: this function only works correctly once the E820 table is sorted and + * not-overlapping (at least for the range specified), which is the case normally. */ -int __init e820_all_mapped(u64 start, u64 end, unsigned type) +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (type && ei->type != type) + if (type && entry->type != type) continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) + + /* Is the region (part) in overlap with the current region? */ + if (entry->addr >= end || entry->addr + entry->size <= start) continue; - /* if the region is at the beginning of <start,end> we move - * start to the end of the region since it's ok until there + /* + * If the region is at the beginning of <start,end> we move + * 'start' to the end of the region since it's ok until there */ - if (ei->addr <= start) - start = ei->addr + ei->size; + if (entry->addr <= start) + start = entry->addr + entry->size; + /* - * if start is now at or beyond end, we're done, full - * coverage + * If 'start' is now at or beyond 'end', we're done, full + * coverage of the desired range exists: */ if (start >= end) return 1; @@ -108,94 +116,77 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) } /* - * Add a memory region to the kernel e820 map. + * Add a memory region to the kernel E820 map. 
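As a minimal, self-contained sketch of the coverage walk that e820__mapped_all() performs above (plain user-space C with hypothetical stand-in types; like the kernel code, it assumes a sorted, non-overlapping table):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t addr, size; int type; };

static bool range_all_mapped(const struct range *map, int nr,
                             uint64_t start, uint64_t end, int type)
{
        for (int i = 0; i < nr; i++) {
                const struct range *r = &map[i];

                if (r->type != type)
                        continue;
                if (r->addr >= end || r->addr + r->size <= start)
                        continue;                       /* no overlap with [start, end) */
                if (r->addr <= start)
                        start = r->addr + r->size;      /* coverage extends this far */
                if (start >= end)
                        return true;                    /* fully covered */
        }
        return false;
}

int main(void)
{
        struct range map[] = {
                { 0x00000000, 0x0009fc00, 1 },          /* usable */
                { 0x00100000, 0x3ff00000, 1 },          /* usable */
        };

        printf("%d\n", range_all_mapped(map, 2, 0x100000, 0x200000, 1));  /* 1 */
        printf("%d\n", range_all_mapped(map, 2, 0x09f000, 0x101000, 1));  /* 0 */
        return 0;
}

The walk only advances 'start' when an overlapping entry begins at or before it, so any hole inside the requested range leaves start short of end and the check fails.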
*/ -static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) +static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { - int x = e820x->nr_map; + int x = table->nr_entries; - if (x >= ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", - (unsigned long long) start, - (unsigned long long) (start + size - 1)); + if (x >= ARRAY_SIZE(table->entries)) { + pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1); return; } - e820x->map[x].addr = start; - e820x->map[x].size = size; - e820x->map[x].type = type; - e820x->nr_map++; + table->entries[x].addr = start; + table->entries[x].size = size; + table->entries[x].type = type; + table->nr_entries++; } -void __init e820_add_region(u64 start, u64 size, int type) +void __init e820__range_add(u64 start, u64 size, enum e820_type type) { - __e820_add_region(e820, start, size, type); + __e820__range_add(e820_table, start, size, type); } -static void __init e820_print_type(u32 type) +static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_RAM: - case E820_RESERVED_KERN: - printk(KERN_CONT "usable"); - break; - case E820_RESERVED: - printk(KERN_CONT "reserved"); - break; - case E820_ACPI: - printk(KERN_CONT "ACPI data"); - break; - case E820_NVS: - printk(KERN_CONT "ACPI NVS"); - break; - case E820_UNUSABLE: - printk(KERN_CONT "unusable"); - break; - case E820_PMEM: - case E820_PRAM: - printk(KERN_CONT "persistent (type %u)", type); - break; - default: - printk(KERN_CONT "type %u", type); - break; + case E820_TYPE_RAM: /* Fall through: */ + case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_ACPI: pr_cont("ACPI data"); break; + case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_PMEM: /* Fall through: */ + case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; + default: pr_cont("type %u", type); break; } } -void __init e820_print_map(char *who) +void __init e820__print_table(char *who) { int i; - for (i = 0; i < e820->nr_map; i++) { - printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820->map[i].addr, - (unsigned long long) - (e820->map[i].addr + e820->map[i].size - 1)); - e820_print_type(e820->map[i].type); - printk(KERN_CONT "\n"); + for (i = 0; i < e820_table->nr_entries; i++) { + pr_info("%s: [mem %#018Lx-%#018Lx] ", who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); + + e820_print_type(e820_table->entries[i].type); + pr_cont("\n"); } } /* - * Sanitize the BIOS e820 map. + * Sanitize an E820 map. * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps, + * Some E820 layouts include overlapping entries. The following + * replaces the original E820 map with a new one, removing overlaps, * and resolving conflicting memory types in favor of highest * numbered type. * - * The input parameter biosmap points to an array of 'struct - * e820entry' which on entry has elements in the range [0, *pnr_map) - * valid, and which has space for up to max_nr_map entries. - * On return, the resulting sanitized e820 map entries will be in - * overwritten in the same location, starting at biosmap. 
+ * The input parameter 'entries' points to an array of 'struct + * e820_entry' which on entry has elements in the range [0, *nr_entries) + * valid, and which has space for up to max_nr_entries entries. + * On return, the resulting sanitized E820 map entries will be in + * overwritten in the same location, starting at 'entries'. * - * The integer pointed to by pnr_map must be valid on entry (the - * current number of valid entries located at biosmap). If the - * sanitizing succeeds the *pnr_map will be updated with the new - * number of valid entries (something no more than max_nr_map). + * The integer pointed to by nr_entries must be valid on entry (the + * current number of valid entries located at 'entries'). If the + * sanitizing succeeds the *nr_entries will be updated with the new + * number of valid entries (something no more than max_nr_entries). * - * The return value from sanitize_e820_map() is zero if it + * The return value from e820__update_table() is zero if it * successfully 'sanitized' the map entries passed in, and is -1 * if it did nothing, which can happen if either of (1) it was * only passed one map entry, or (2) any of the input map entries @@ -238,10 +229,17 @@ void __init e820_print_map(char *who) * ______________________4_ */ struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ + /* Pointer to the original entry: */ + struct e820_entry *entry; + /* Address for this change point: */ + unsigned long long addr; }; +static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; +static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; +static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; +static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata; + static int __init cpcompare(const void *a, const void *b) { struct change_member * const *app = a, * const *bpp = b; @@ -249,164 +247,140 @@ static int __init cpcompare(const void *a, const void *b) /* * Inputs are pointers to two elements of change_point[]. If their - * addresses are unequal, their difference dominates. If the addresses + * addresses are not equal, their difference dominates. If the addresses * are equal, then consider one that represents the end of its region * to be greater than one that does not. */ if (ap->addr != bp->addr) return ap->addr > bp->addr ? 
1 : -1; - return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); + return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } -int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - u32 *pnr_map) +int __init e820__update_table(struct e820_table *table) { - static struct change_member change_point_list[2*E820_X_MAX] __initdata; - static struct change_member *change_point[2*E820_X_MAX] __initdata; - static struct e820entry *overlap_list[E820_X_MAX] __initdata; - static struct e820entry new_bios[E820_X_MAX] __initdata; - unsigned long current_type, last_type; + struct e820_entry *entries = table->entries; + u32 max_nr_entries = ARRAY_SIZE(table->entries); + enum e820_type current_type, last_type; unsigned long long last_addr; - int chgidx; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; + u32 new_nr_entries, overlap_entries; + u32 i, chg_idx, chg_nr; - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) + /* If there's only one memory region, don't bother: */ + if (table->nr_entries < 2) return -1; - old_nr = *pnr_map; - BUG_ON(old_nr > max_nr_map); + BUG_ON(table->nr_entries > max_nr_entries); - /* bail out if we find any unreasonable addresses in bios map */ - for (i = 0; i < old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + /* Bail out if we find any unreasonable addresses in the map: */ + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].addr + entries[i].size < entries[i].addr) return -1; + } - /* create pointers for initial change-point information (for sorting) */ - for (i = 0; i < 2 * old_nr; i++) + /* Create pointers for initial change-point information (for sorting): */ + for (i = 0; i < 2 * table->nr_entries; i++) change_point[i] = &change_point_list[i]; - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i = 0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + - biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; + /* + * Record all known change-points (starting and ending addresses), + * omitting empty memory regions: + */ + chg_idx = 0; + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].size != 0) { + change_point[chg_idx]->addr = entries[i].addr; + change_point[chg_idx++]->entry = &entries[i]; + change_point[chg_idx]->addr = entries[i].addr + entries[i].size; + change_point[chg_idx++]->entry = &entries[i]; } } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); - - /* create a new bios memory map, removing overlaps */ - overlap_entries = 0; /* number of entries in the overlap table */ - new_bios_entry = 0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - - /* loop through change-points, determining affect on the new bios map */ - for (chgidx = 0; chgidx < chg_nr; chgidx++) { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == - change_point[chgidx]->pbios->addr) { - /* - * add map entry to overlap list (> 1 entry - * implies an overlap) - */ - overlap_list[overlap_entries++] = - change_point[chgidx]->pbios; + chg_nr = chg_idx; + + /* Sort 
change-point list by memory addresses (low -> high): */ + sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL); + + /* Create a new memory map, removing overlaps: */ + overlap_entries = 0; /* Number of entries in the overlap table */ + new_nr_entries = 0; /* Index for creating new map entries */ + last_type = 0; /* Start with undefined memory type */ + last_addr = 0; /* Start with 0 as last starting address */ + + /* Loop through change-points, determining effect on the new map: */ + for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) { + /* Keep track of all overlapping entries */ + if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) { + /* Add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++] = change_point[chg_idx]->entry; } else { - /* - * remove entry from list (order independent, - * so swap with last) - */ + /* Remove entry from list (order independent, so swap with last): */ for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == - change_point[chgidx]->pbios) - overlap_list[i] = - overlap_list[overlap_entries-1]; + if (overlap_list[i] == change_point[chg_idx]->entry) + overlap_list[i] = overlap_list[overlap_entries-1]; } overlap_entries--; } /* - * if there are overlapping entries, decide which + * If there are overlapping entries, decide which * "type" to use (larger value takes precedence -- * 1=usable, 2,3,4,4+=unusable) */ current_type = 0; - for (i = 0; i < overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) { if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* - * continue building up new bios map based on this - * information - */ - if (current_type != last_type || current_type == E820_PRAM) { + } + + /* Continue building up new map based on this information: */ + if (current_type != last_type || current_type == E820_TYPE_PRAM) { if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* - * move forward only if the new size - * was non-zero - */ - if (new_bios[new_bios_entry].size != 0) - /* - * no more space left for new - * bios entries ? - */ - if (++new_bios_entry >= max_nr_map) + new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; + /* Move forward only if the new size was non-zero: */ + if (new_entries[new_nr_entries].size != 0) + /* No more space left for new entries? 
*/ + if (++new_nr_entries >= max_nr_entries) break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = - change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr = change_point[chgidx]->addr; + new_entries[new_nr_entries].addr = change_point[chg_idx]->addr; + new_entries[new_nr_entries].type = current_type; + last_addr = change_point[chg_idx]->addr; } last_type = current_type; } } - /* retain count for new bios entries */ - new_nr = new_bios_entry; - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); - *pnr_map = new_nr; + /* Copy the new entries into the original location: */ + memcpy(entries, new_entries, new_nr_entries*sizeof(*entries)); + table->nr_entries = new_nr_entries; return 0; } -static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { - while (nr_map) { - u64 start = biosmap->addr; - u64 size = biosmap->size; + struct boot_e820_entry *entry = entries; + + while (nr_entries) { + u64 start = entry->addr; + u64 size = entry->size; u64 end = start + size - 1; - u32 type = biosmap->type; + u32 type = entry->type; - /* Overflow in 64 bits? Ignore the memory map. */ + /* Ignore the entry on 64-bit overflow: */ if (start > end && likely(size)) return -1; - e820_add_region(start, size, type); + e820__range_add(start, size, type); - biosmap++; - nr_map--; + entry++; + nr_entries--; } return 0; } /* - * Copy the BIOS e820 map into a safe place. + * Copy the BIOS E820 map into a safe place. * * Sanity-check it while we're at it.. * @@ -414,18 +388,17 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init append_e820_map(struct e820entry *biosmap, int nr_map) +static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) + if (nr_entries < 2) return -1; - return __append_e820_map(biosmap, nr_map); + return __append_e820_table(entries, nr_entries); } -static u64 __init __e820_update_range(struct e820map *e820x, u64 start, - u64 size, unsigned old_type, - unsigned new_type) +static u64 __init +__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; unsigned int i; @@ -437,77 +410,73 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); e820_print_type(old_type); - printk(KERN_CONT " ==> "); + pr_cont(" ==> "); e820_print_type(new_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820x->nr_map; i++) { - struct e820entry *ei = &e820x->map[i]; + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (ei->type != old_type) + if (entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered by new range? 
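The sanitizing pass above is a change-point sweep: explode every range into a start event and an end event, sort the events, keep the set of currently open ranges, and emit a new segment whenever the highest active type changes. A compact user-space sketch of the same idea, using made-up types and fixed-size arrays rather than the kernel's e820 structures:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ENTRIES 16

struct entry  { uint64_t addr, size; int type; };
struct cpoint { uint64_t addr; const struct entry *e; };

static int cpcmp(const void *a, const void *b)
{
        const struct cpoint *x = a, *y = b;

        if (x->addr != y->addr)
                return x->addr > y->addr ? 1 : -1;
        /* At equal addresses, region starts sort before region ends: */
        return (x->addr != x->e->addr) - (y->addr != y->e->addr);
}

/* Rewrites 'e' in place, returns the new entry count (result assumed to fit). */
static int squash(struct entry *e, int nr)
{
        struct cpoint cp[2 * MAX_ENTRIES];
        const struct entry *active[MAX_ENTRIES];
        struct entry out[MAX_ENTRIES];
        int nr_cp = 0, nr_active = 0, nr_out = 0, last_type = 0;
        uint64_t last_addr = 0;

        for (int i = 0; i < nr; i++) {
                cp[nr_cp++] = (struct cpoint){ e[i].addr, &e[i] };
                cp[nr_cp++] = (struct cpoint){ e[i].addr + e[i].size, &e[i] };
        }
        qsort(cp, nr_cp, sizeof(cp[0]), cpcmp);

        for (int i = 0; i < nr_cp; i++) {
                if (cp[i].addr == cp[i].e->addr) {
                        active[nr_active++] = cp[i].e;          /* range opens */
                } else {                                        /* range closes */
                        for (int j = 0; j < nr_active; j++)
                                if (active[j] == cp[i].e)
                                        active[j] = active[--nr_active];
                }

                int cur = 0;                            /* highest active type wins */
                for (int j = 0; j < nr_active; j++)
                        if (active[j]->type > cur)
                                cur = active[j]->type;

                if (cur != last_type) {
                        if (last_type && cp[i].addr != last_addr)
                                out[nr_out++] = (struct entry){ last_addr, cp[i].addr - last_addr, last_type };
                        last_addr = cp[i].addr;
                        last_type = cur;
                }
        }

        for (int i = 0; i < nr_out; i++)
                e[i] = out[i];
        return nr_out;
}

int main(void)
{
        struct entry e[] = {
                { 0x0000, 0x8000, 1 },          /* "RAM" */
                { 0x4000, 0x2000, 2 },          /* "reserved", overlapping the RAM */
        };
        int n = squash(e, 2);

        for (int i = 0; i < n; i++)
                printf("[%#llx-%#llx] type %d\n",
                       (unsigned long long)e[i].addr,
                       (unsigned long long)(e[i].addr + e[i].size - 1), e[i].type);
        return 0;
}

Feeding it a RAM range with a reserved range punched into the middle yields the three non-overlapping segments the kernel's version would produce.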
*/ - if (ei->addr >= start && ei_end <= end) { - ei->type = new_type; - real_updated_size += ei->size; + entry_end = entry->addr + entry->size; + + /* Completely covered by new range? */ + if (entry->addr >= start && entry_end <= end) { + entry->type = new_type; + real_updated_size += entry->size; continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* New range is completely covered? */ + if (entry->addr < start && entry_end > end) { + __e820__range_add(table, start, size, new_type); + __e820__range_add(table, end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_updated_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; - __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + __e820__range_add(table, final_start, final_end - final_start, new_type); real_updated_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * its size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_updated_size; } -u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) +u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820, start, size, old_type, new_type); + return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820_update_range_saved(u64 start, u64 size, - unsigned old_type, unsigned new_type) +static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820_update_range(e820_saved, start, size, old_type, - new_type); + return __e820__range_update(e820_table_firmware, start, size, old_type, new_type); } -/* make e820 not cover the range */ -u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, - int checktype) +/* Remove a range of memory from the E820 table: */ +u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { int i; u64 end; @@ -517,85 +486,89 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", - (unsigned long long) start, (unsigned long long) (end - 1)); - if (checktype) + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); + if (check_type) e820_print_type(old_type); - printk(KERN_CONT "\n"); + pr_cont("\n"); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 final_start, final_end; - u64 ei_end; + u64 entry_end; - if (checktype && ei->type != old_type) + if (check_type && entry->type != old_type) continue; - ei_end = ei->addr + ei->size; - /* totally covered? 
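The per-entry case analysis that __e820__range_update() performs above, reduced to a single hypothetical entry: depending on how the update range overlaps it, the entry is retyped whole, split into three pieces, or trimmed with a retyped piece added next to it. A sketch under those assumptions, not the kernel's table-rewriting code:

#include <stdint.h>
#include <stdio.h>

struct rng { uint64_t addr, size; int type; };

/* Writes up to 3 resulting ranges into out[], returns how many. */
static int update_one(struct rng e, uint64_t start, uint64_t end,
                      int new_type, struct rng out[3])
{
        uint64_t e_end = e.addr + e.size;

        /* No overlap at all - entry is untouched: */
        if (e.addr >= end || e_end <= start) {
                out[0] = e;
                return 1;
        }

        /* Entry completely covered by the update range - just retype it: */
        if (e.addr >= start && e_end <= end) {
                out[0] = (struct rng){ e.addr, e.size, new_type };
                return 1;
        }

        /* Update range strictly inside the entry - split into three pieces: */
        if (e.addr < start && e_end > end) {
                out[0] = (struct rng){ e.addr, start - e.addr, e.type };
                out[1] = (struct rng){ start, end - start, new_type };
                out[2] = (struct rng){ end, e_end - end, e.type };
                return 3;
        }

        /* Partial overlap - keep the untouched part, retype the overlap: */
        uint64_t fs = start > e.addr ? start : e.addr;
        uint64_t fe = end < e_end ? end : e_end;

        if (e.addr < fs) {                              /* overlap is the tail */
                out[0] = (struct rng){ e.addr, fs - e.addr, e.type };
                out[1] = (struct rng){ fs, fe - fs, new_type };
        } else {                                        /* overlap is the head */
                out[0] = (struct rng){ fs, fe - fs, new_type };
                out[1] = (struct rng){ fe, e_end - fe, e.type };
        }
        return 2;
}

int main(void)
{
        struct rng out[3];
        int n = update_one((struct rng){ 0x1000, 0x4000, 1 }, 0x2000, 0x3000, 2, out);

        for (int i = 0; i < n; i++)
                printf("[%#llx +%#llx] type %d\n",
                       (unsigned long long)out[i].addr,
                       (unsigned long long)out[i].size, out[i].type);
        return 0;
}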
*/ - if (ei->addr >= start && ei_end <= end) { - real_removed_size += ei->size; - memset(ei, 0, sizeof(struct e820entry)); + entry_end = entry->addr + entry->size; + + /* Completely covered? */ + if (entry->addr >= start && entry_end <= end) { + real_removed_size += entry->size; + memset(entry, 0, sizeof(*entry)); continue; } - /* new range is totally covered? */ - if (ei->addr < start && ei_end > end) { - e820_add_region(end, ei_end - end, ei->type); - ei->size = start - ei->addr; + /* Is the new range completely covered? */ + if (entry->addr < start && entry_end > end) { + e820__range_add(end, entry_end - end, entry->type); + entry->size = start - entry->addr; real_removed_size += size; continue; } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(end, ei_end); + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); if (final_start >= final_end) continue; + real_removed_size += final_end - final_start; /* - * left range could be head or tail, so need to update - * size at first. + * Left range could be head or tail, so need to update + * the size first: */ - ei->size -= final_end - final_start; - if (ei->addr < final_start) + entry->size -= final_end - final_start; + if (entry->addr < final_start) continue; - ei->addr = final_end; + + entry->addr = final_end; } return real_removed_size; } -void __init update_e820(void) +void __init e820__update_table_print(void) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) + if (e820__update_table(e820_table)) return; - printk(KERN_INFO "e820: modified physical RAM map:\n"); - e820_print_map("modified"); + + pr_info("e820: modified physical RAM map:\n"); + e820__print_table("modified"); } -static void __init update_e820_saved(void) + +static void __init e820__update_table_firmware(void) { - sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), - &e820_saved->nr_map); + e820__update_table(e820_table_firmware); } + #define MAX_GAP_END 0x100000000ull + /* - * Search for a gap in the e820 memory space from 0 to MAX_GAP_END. + * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). */ -static int __init e820_search_gap(unsigned long *gapstart, - unsigned long *gapsize) +static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) { unsigned long long last = MAX_GAP_END; - int i = e820->nr_map; + int i = e820_table->nr_entries; int found = 0; while (--i >= 0) { - unsigned long long start = e820->map[i].addr; - unsigned long long end = start + e820->map[i].size; + unsigned long long start = e820_table->entries[i].addr; + unsigned long long end = start + e820_table->entries[i].size; /* * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true + * fit in 32 bits if this condition is true: */ if (last > end) { unsigned long gap = last - end; @@ -613,12 +586,14 @@ static int __init e820_search_gap(unsigned long *gapstart, } /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. + * Search for the biggest gap in the low 32 bits of the E820 + * memory space. We pass this space to the PCI subsystem, so + * that it can assign MMIO resources for hotplug or + * unconfigured devices in. + * * Hopefully the BIOS let enough space left. 
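A stand-alone sketch of the downward walk e820_search_gap() does above: start from the 4 GiB limit, scan entries from highest to lowest, and treat every stretch between the end of an entry and the running limit as a candidate gap (hypothetical struct and example map, not the kernel interface):

#include <stdint.h>
#include <stdio.h>

struct rng { uint64_t addr, size; };

/* 'map' is sorted by address; only look below 4 GiB, as the PCI code does. */
static int largest_gap(const struct rng *map, int nr,
                       uint64_t *gap_start, uint64_t *gap_size)
{
        uint64_t last = 0x100000000ull;         /* MAX_GAP_END */
        int found = 0;

        for (int i = nr - 1; i >= 0; i--) {
                uint64_t start = map[i].addr;
                uint64_t end = start + map[i].size;

                /* Hole between the end of this entry and the running limit? */
                if (last > end && last - end > *gap_size) {
                        *gap_size = last - end;
                        *gap_start = end;
                        found = 1;
                }
                if (start < last)
                        last = start;
        }
        return found;
}

int main(void)
{
        const struct rng map[] = {
                { 0x00000000, 0x0009fc00 },     /* low RAM */
                { 0x00100000, 0xbff00000 },     /* RAM up to ~3 GiB */
                { 0xfec00000, 0x00001000 },     /* IO-APIC */
        };
        uint64_t gs = 0, gz = 0x400000;         /* require at least 4 MiB */

        if (largest_gap(map, 3, &gs, &gz))
                printf("gap [%#llx, +%#llx] available for PCI BARs\n",
                       (unsigned long long)gs, (unsigned long long)gz);
        return 0;
}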
*/ -__init void e820_setup_gap(void) +__init void e820__setup_pci_gap(void) { unsigned long gapstart, gapsize; int found; @@ -629,138 +604,143 @@ __init void e820_setup_gap(void) if (!found) { #ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR - "e820: cannot find a gap in the 32bit address range\n" - "e820: PCI devices with unassigned 32bit BARs may break!\n"); + pr_err( + "e820: Cannot find an available gap in the 32-bit address range\n" + "e820: PCI devices with unassigned 32-bit BARs may not work!\n"); #else gapstart = 0x10000000; #endif } /* - * e820_reserve_resources_late protect stolen RAM already + * e820__reserve_resources_late() protects stolen RAM already: */ pci_mem_start = gapstart; - printk(KERN_INFO - "e820: [mem %#010lx-%#010lx] available for PCI devices\n", - gapstart, gapstart + gapsize - 1); + pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); } /* * Called late during init, in free_initmem(). * - * Initial e820 and e820_saved are largish __initdata arrays. - * Copy them to (usually much smaller) dynamically allocated area. - * This is done after all tweaks we ever do to them: - * all functions which modify them are __init functions, - * they won't exist after this point. + * Initial e820_table and e820_table_firmware are largish __initdata arrays. + * + * Copy them to a (usually much smaller) dynamically allocated area that is + * sized precisely after the number of e820 entries. + * + * This is done after we've performed all the fixes and tweaks to the tables. + * All functions which modify them are __init functions, which won't exist + * after free_initmem(). */ -__init void e820_reallocate_tables(void) +__init void e820__reallocate_tables(void) { - struct e820map *n; + struct e820_table *n; int size; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820, size); - e820 = n; + memcpy(n, e820_table, size); + e820_table = n; - size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820_saved, size); - e820_saved = n; + memcpy(n, e820_table_firmware, size); + e820_table_firmware = n; } -/** - * Because of the size limitation of struct boot_params, only first - * 128 E820 memory entries are passed to kernel via - * boot_params.e820_map, others are passed via SETUP_E820_EXT node of - * linked list of struct setup_data, which is parsed here. +/* + * Because of the small fixed size of struct boot_params, only the first + * 128 E820 memory entries are passed to the kernel via boot_params.e820_table, + * the remaining (if any) entries are passed via the SETUP_E820_EXT node of + * struct setup_data, which is parsed here. 
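The comment above describes the SETUP_E820_EXT hand-off. A rough user-space model of that walk, with ordinary pointers standing in for physical addresses and early_memremap() elided, assuming the packed 20-byte boot_e820_entry layout that the patch asserts elsewhere with BUILD_BUG_ON():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct boot_e820_entry { uint64_t addr, size; uint32_t type; } __attribute__((packed));

struct setup_data {
        uint64_t next;          /* "physical address" of the next node, 0 ends the list */
        uint32_t type;          /* node type id (SETUP_E820_EXT in the real boot ABI) */
        uint32_t len;           /* number of payload bytes in data[] */
        uint8_t  data[];
};

int main(void)
{
        /* Build one fake extended-E820 node carrying two extra entries: */
        struct boot_e820_entry extra[2] = {
                { 0x100000000ull, 0x40000000ull, 1 },
                { 0x140000000ull, 0x40000000ull, 1 },
        };
        struct setup_data *sd = malloc(sizeof(*sd) + sizeof(extra));

        sd->next = 0;
        sd->type = 1;                           /* illustrative id only */
        sd->len  = sizeof(extra);
        memcpy(sd->data, extra, sizeof(extra));

        /* The walk e820__memory_setup_extended() does, minus the remapping: */
        for (struct setup_data *p = sd; p; p = (struct setup_data *)(uintptr_t)p->next) {
                int entries = p->len / sizeof(struct boot_e820_entry);
                struct boot_e820_entry *map = (struct boot_e820_entry *)p->data;

                for (int i = 0; i < entries; i++)
                        printf("append [%#llx +%#llx] type %u\n",
                               (unsigned long long)map[i].addr,
                               (unsigned long long)map[i].size, map[i].type);
        }
        free(sd);
        return 0;
}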
*/ -void __init parse_e820_ext(u64 phys_addr, u32 data_len) +void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) { int entries; - struct e820entry *extmap; + struct boot_e820_entry *extmap; struct setup_data *sdata; sdata = early_memremap(phys_addr, data_len); - entries = sdata->len / sizeof(struct e820entry); - extmap = (struct e820entry *)(sdata->data); - __append_e820_map(extmap, entries); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + entries = sdata->len / sizeof(*extmap); + extmap = (struct boot_e820_entry *)(sdata->data); + + __append_e820_table(extmap, entries); + e820__update_table(e820_table); + early_memunmap(sdata, data_len); - printk(KERN_INFO "e820: extended physical RAM map:\n"); - e820_print_map("extended"); + pr_info("e820: extended physical RAM map:\n"); + e820__print_table("extended"); } -#if defined(CONFIG_X86_64) || \ - (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) -/** +/* * Find the ranges of physical addresses that do not correspond to - * e820 RAM areas and mark the corresponding pages as nosave for - * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * E820 RAM areas and register the corresponding pages as 'nosave' for + * hibernation (32-bit) or software suspend and suspend to RAM (64-bit). * - * This function requires the e820 map to be sorted and without any + * This function requires the E820 map to be sorted and without any * overlapping entries. */ -void __init e820_mark_nosave_regions(unsigned long limit_pfn) +void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; unsigned long pfn = 0; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(ei->addr)) - register_nosave_region(pfn, PFN_UP(ei->addr)); + if (pfn < PFN_UP(entry->addr)) + register_nosave_region(pfn, PFN_UP(entry->addr)); - pfn = PFN_DOWN(ei->addr + ei->size); + pfn = PFN_DOWN(entry->addr + entry->size); - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) - register_nosave_region(PFN_UP(ei->addr), pfn); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + register_nosave_region(PFN_UP(entry->addr), pfn); if (pfn >= limit_pfn) break; } } -#endif #ifdef CONFIG_ACPI -/** - * Mark ACPI NVS memory region, so that we can save/restore it during - * hibernation and the subsequent resume. +/* + * Register ACPI NVS memory regions, so that we can save/restore them during + * hibernation and the subsequent resume: */ -static int __init e820_mark_nvs_memory(void) +static int __init e820__register_nvs_regions(void) { int i; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - if (ei->type == E820_NVS) - acpi_nvs_register(ei->addr, ei->size); + if (entry->type == E820_TYPE_NVS) + acpi_nvs_register(entry->addr, entry->size); } return 0; } -core_initcall(e820_mark_nvs_memory); +core_initcall(e820__register_nvs_regions); #endif /* - * pre allocated 4k and reserved it in memblock and e820_saved + * Allocate the requested number of bytes with the requsted alignment + * and return (the physical address) to the caller. Also register this + * range in the 'firmware' E820 table as a reserved range. + * + * This allows kexec to fake a new mptable, as if it came from the real + * system. 
*/ -u64 __init early_reserve_e820(u64 size, u64 align) +u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) { u64 addr; addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { - e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); - update_e820_saved(); + e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n"); + e820__update_table_firmware(); } return addr; @@ -779,22 +759,22 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) { int i; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; unsigned long start_pfn; unsigned long end_pfn; - if (ei->type != type) + if (entry->type != type) continue; - start_pfn = ei->addr >> PAGE_SHIFT; - end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + start_pfn = entry->addr >> PAGE_SHIFT; + end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT; if (start_pfn >= limit_pfn) continue; @@ -809,18 +789,19 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", + pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } -unsigned long __init e820_end_of_ram_pfn(void) + +unsigned long __init e820__end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); + return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); } -unsigned long __init e820_end_of_low_ram_pfn(void) +unsigned long __init e820__end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); } static void __init early_panic(char *msg) @@ -831,7 +812,7 @@ static void __init early_panic(char *msg) static int userdef __initdata; -/* "mem=nopentium" disables the 4MB page tables. */ +/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ static int __init parse_memopt(char *p) { u64 mem_size; @@ -844,17 +825,19 @@ static int __init parse_memopt(char *p) setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; #else - printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + pr_warn("mem=nopentium ignored! (only supported on x86_32)\n"); return -EINVAL; #endif } userdef = 1; mem_size = memparse(p, &p); - /* don't remove all of memory when handling "mem={invalid}" param */ + + /* Don't remove all memory when getting "mem={invalid}" parameter: */ if (mem_size == 0) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); return 0; } @@ -872,12 +855,12 @@ static int __init parse_memmap_one(char *p) #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know - * the real mem size before original memory map is + * the real memory size before the original memory map is * reset. 
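Both mem= and memmap= above lean on memparse(), which reads a number with an optional size suffix. A minimal stand-alone equivalent, assuming only the common K/M/G/T suffixes, makes that parsing step concrete:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal memparse()-like helper: "64M", "2G", "0x1000000", ... */
static uint64_t parse_size(const char *s, char **end)
{
        uint64_t v = strtoull(s, end, 0);       /* base 0: decimal, octal or hex */

        switch (**end) {
        case 'K': case 'k': v <<= 10; (*end)++; break;
        case 'M': case 'm': v <<= 20; (*end)++; break;
        case 'G': case 'g': v <<= 30; (*end)++; break;
        case 'T': case 't': v <<= 40; (*end)++; break;
        }
        return v;
}

int main(void)
{
        char *p;
        uint64_t mem = parse_size("2G", &p);

        /* "mem=2G" then removes everything above 2 GiB from the RAM map. */
        printf("limit RAM to %#llx, remove [%#llx, ~0ULL)\n",
               (unsigned long long)mem, (unsigned long long)mem);
        return 0;
}

For memmap=, the character that follows the size ('@', '#', '$' or '!') then decides whether the range is added as RAM, ACPI data, reserved or persistent memory, as in the branches of parse_memmap_one() above.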
*/ - saved_max_pfn = e820_end_of_ram_pfn(); + saved_max_pfn = e820__end_of_ram_pfn(); #endif - e820->nr_map = 0; + e820_table->nr_entries = 0; userdef = 1; return 0; } @@ -890,21 +873,23 @@ static int __init parse_memmap_one(char *p) userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RAM); + e820__range_add(start_at, mem_size, E820_TYPE_RAM); } else if (*p == '#') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_ACPI); + e820__range_add(start_at, mem_size, E820_TYPE_ACPI); } else if (*p == '$') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_RESERVED); + e820__range_add(start_at, mem_size, E820_TYPE_RESERVED); } else if (*p == '!') { start_at = memparse(p+1, &p); - e820_add_region(start_at, mem_size, E820_PRAM); - } else - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else { + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + } return *p == '\0' ? 0 : -EINVAL; } + static int __init parse_memmap_opt(char *str) { while (str) { @@ -921,68 +906,97 @@ static int __init parse_memmap_opt(char *str) } early_param("memmap", parse_memmap_opt); -void __init finish_e820_parsing(void) +/* + * Reserve all entries from the bootloader's extensible data nodes list, + * because if present we are going to use it later on to fetch e820 + * entries from it: + */ +void __init e820__reserve_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + pa_data = data->next; + early_memunmap(data, sizeof(*data)); + } + + e820__update_table(e820_table); + + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("extended physical RAM map:\n"); + e820__print_table("reserve setup_data"); +} + +/* + * Called after parse_early_param(), after early parameters (such as mem=) + * have been processed, in which case we already have an E820 table filled in + * via the parameter callback function(s), but it's not sorted and printed yet: + */ +void __init e820__finish_early_params(void) { if (userdef) { - if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), - &e820->nr_map) < 0) + if (e820__update_table(e820_table) < 0) early_panic("Invalid user supplied memory map"); - printk(KERN_INFO "e820: user-defined physical RAM map:\n"); - e820_print_map("user"); + pr_info("e820: user-defined physical RAM map:\n"); + e820__print_table("user"); } } -static const char *__init e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: return "System RAM"; - case E820_ACPI: return "ACPI Tables"; - case E820_NVS: return "ACPI Non-volatile Storage"; - case E820_UNUSABLE: return "Unusable memory"; - case E820_PRAM: return "Persistent Memory (legacy)"; - case E820_PMEM: return "Persistent Memory"; - default: return "reserved"; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return "System RAM"; + case E820_TYPE_ACPI: return "ACPI Tables"; + case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; + case E820_TYPE_UNUSABLE: return "Unusable memory"; + case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; + case 
E820_TYPE_PMEM: return "Persistent Memory"; + case E820_TYPE_RESERVED: return "Reserved"; + default: return "Unknown E820 type"; } } -static unsigned long __init e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { - switch (e820_type) { - case E820_RESERVED_KERN: - case E820_RAM: - return IORESOURCE_SYSTEM_RAM; - case E820_ACPI: - case E820_NVS: - case E820_UNUSABLE: - case E820_PRAM: - case E820_PMEM: - default: - return IORESOURCE_MEM; + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; + case E820_TYPE_ACPI: /* Fall-through: */ + case E820_TYPE_NVS: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_PRAM: /* Fall-through: */ + case E820_TYPE_PMEM: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORESOURCE_MEM; } } -static unsigned long __init e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) { - switch (e820_type) { - case E820_ACPI: - return IORES_DESC_ACPI_TABLES; - case E820_NVS: - return IORES_DESC_ACPI_NV_STORAGE; - case E820_PMEM: - return IORES_DESC_PERSISTENT_MEMORY; - case E820_PRAM: - return IORES_DESC_PERSISTENT_MEMORY_LEGACY; - case E820_RESERVED_KERN: - case E820_RAM: - case E820_UNUSABLE: - default: - return IORES_DESC_NONE; + switch (entry->type) { + case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; + case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; + case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; + case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + default: return IORES_DESC_NONE; } } -static bool __init do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(enum e820_type type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -993,61 +1007,71 @@ static bool __init do_mark_busy(u32 type, struct resource *res) * for exclusive use of a driver */ switch (type) { - case E820_RESERVED: - case E820_PRAM: - case E820_PMEM: + case E820_TYPE_RESERVED: + case E820_TYPE_PRAM: + case E820_TYPE_PMEM: return false; + case E820_TYPE_RESERVED_KERN: + case E820_TYPE_RAM: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: default: return true; } } /* - * Mark e820 reserved areas as busy for the resource manager. 
+ * Mark E820 reserved areas as busy for the resource manager: */ + static struct resource __initdata *e820_res; -void __init e820_reserve_resources(void) + +void __init e820__reserve_resources(void) { int i; struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); + res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries); e820_res = res; - for (i = 0; i < e820->nr_map; i++) { - end = e820->map[i].addr + e820->map[i].size - 1; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = e820_table->entries + i; + + end = entry->addr + entry->size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820->map[i].type); - res->start = e820->map[i].addr; - res->end = end; - - res->flags = e820_type_to_iomem_type(e820->map[i].type); - res->desc = e820_type_to_iores_desc(e820->map[i].type); + res->start = entry->addr; + res->end = end; + res->name = e820_type_to_string(entry); + res->flags = e820_type_to_iomem_type(entry); + res->desc = e820_type_to_iores_desc(entry); /* - * don't register the region that could be conflicted with - * pci device BAR resource and insert them later in - * pcibios_resource_survey() + * Don't register the region that could be conflicted with + * PCI device BAR resources and insert them later in + * pcibios_resource_survey(): */ - if (do_mark_busy(e820->map[i].type, res)) { + if (do_mark_busy(entry->type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved->nr_map; i++) { - struct e820entry *entry = &e820_saved->map[i]; - firmware_map_add_early(entry->addr, - entry->addr + entry->size, - e820_type_to_string(entry->type)); + for (i = 0; i < e820_table_firmware->nr_entries; i++) { + struct e820_entry *entry = e820_table_firmware->entries + i; + + firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } } -/* How much should we pad RAM ending depending on where it is? */ +/* + * How much should we pad the end of RAM, depending on where it is? 
+ */ static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1066,64 +1090,59 @@ static unsigned long __init ram_alignment(resource_size_t pos) #define MAX_RESOURCE_SIZE ((resource_size_t)-1) -void __init e820_reserve_resources_late(void) +void __init e820__reserve_resources_late(void) { int i; struct resource *res; res = e820_res; - for (i = 0; i < e820->nr_map; i++) { + for (i = 0; i < e820_table->nr_entries; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; } /* - * Try to bump up RAM regions to reasonable boundaries to + * Try to bump up RAM regions to reasonable boundaries, to * avoid stolen RAM: */ - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; u64 start, end; - if (entry->type != E820_RAM) + if (entry->type != E820_TYPE_RAM) continue; + start = entry->addr + entry->size; end = round_up(start, ram_alignment(start)) - 1; if (end > MAX_RESOURCE_SIZE) end = MAX_RESOURCE_SIZE; if (start >= end) continue; - printk(KERN_DEBUG - "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", - start, end); - reserve_region_with_split(&iomem_resource, start, end, - "RAM buffer"); + + printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } } -char *__init default_machine_specific_memory_setup(void) +/* + * Pass the firmware (bootloader) E820 map to the kernel and process it: + */ +char *__init e820__memory_setup_default(void) { char *who = "BIOS-e820"; - u32 new_nr; + /* * Try to copy the BIOS-supplied E820-map. * * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { + if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) { u64 mem_size; - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { + /* Compare results from other methods and take the one that gives more RAM: */ + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { @@ -1131,84 +1150,68 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820->nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + e820_table->nr_entries = 0; + e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM); + e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM); } - /* In case someone cares... 
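When no usable E820 data arrives, the default setup above falls back to the legacy BIOS-88/e801 sizes and fabricates a two-entry map. A toy version of that fallback, assuming the conventional 640 KiB low-memory and 1 MiB high-memory cut-offs rather than the kernel's exact LOWMEMSIZE()/HIGH_MEMORY values:

#include <stdint.h>
#include <stdio.h>

struct rng { uint64_t addr, size; int type; };

int main(void)
{
        /* Sizes reported by the legacy BIOS interfaces, in KiB: */
        uint64_t ext_mem_k = 65536;             /* "BIOS-88" value  */
        uint64_t alt_mem_k = 1048576;           /* "BIOS-e801" value */

        /* Take whichever interface reports more RAM: */
        uint64_t mem_size = alt_mem_k > ext_mem_k ? alt_mem_k : ext_mem_k;

        /* Fake two-entry map: low memory, then everything above 1 MiB: */
        struct rng map[2] = {
                { 0,           640 * 1024,     1 },
                { 1024 * 1024, mem_size << 10, 1 },
        };

        for (int i = 0; i < 2; i++)
                printf("[mem %#010llx-%#010llx] usable\n",
                       (unsigned long long)map[i].addr,
                       (unsigned long long)(map[i].addr + map[i].size - 1));
        return 0;
}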
*/ + /* We just appended a lot of ranges, sanitize the table: */ + e820__update_table(e820_table); + return who; } -void __init setup_memory_map(void) +/* + * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader + * E820 map - with an optional platform quirk available for virtual platforms + * to override this method of boot environment processing: + */ +void __init e820__memory_setup(void) { char *who; + /* This is a firmware interface ABI - make sure we don't break it: */ + BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20); + who = x86_init.resources.memory_setup(); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); - e820_print_map(who); + + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("e820: BIOS-provided physical RAM map:\n"); + e820__print_table(who); } -void __init memblock_x86_fill(void) +void __init e820__memblock_setup(void) { int i; u64 end; /* - * EFI may have more than 128 entries - * We are safe to enable resizing, beause memblock_x86_fill() - * is rather later for x86 + * The bootstrap memblock region count maximum is 128 entries + * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries + * than that - so allow memblock resizing. + * + * This is safe, because this call happens pretty late during x86 setup, + * so we know about reserved memory regions already. (This is important + * so that memblock resizing does no stomp over reserved areas.) */ memblock_allow_resize(); - for (i = 0; i < e820->nr_map; i++) { - struct e820entry *ei = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; - end = ei->addr + ei->size; + end = entry->addr + entry->size; if (end != (resource_size_t)end) continue; - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; - memblock_add(ei->addr, ei->size); + memblock_add(entry->addr, entry->size); } - /* throw away partial pages */ + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); memblock_dump_all(); } - -void __init memblock_find_dma_reserve(void) -{ -#ifdef CONFIG_X86_64 - u64 nr_pages = 0, nr_free_pages = 0; - unsigned long start_pfn, end_pfn; - phys_addr_t start, end; - int i; - u64 u; - - /* - * need to find out used area below MAX_DMA_PFN - * need to use memblock to get free size in [0, MAX_DMA_PFN] - * at first, and assume boot_mem will not take below MAX_DMA_PFN - */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min(start_pfn, MAX_DMA_PFN); - end_pfn = min(end_pfn, MAX_DMA_PFN); - nr_pages += end_pfn - start_pfn; - } - - for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) { - start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); - end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); - if (start_pfn < end_pfn) - nr_free_pages += end_pfn - start_pfn; - } - - set_dma_reserve(nr_pages - nr_free_pages); -#endif -} diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6a08e25a48d8..d907c3d8633f 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -526,6 +526,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_SKL_IDS(&gen9_early_ops), INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), + INTEL_GLK_IDS(&gen9_early_ops), }; static void __init @@ -546,8 +547,8 @@ 
intel_graphics_stolen(int num, int slot, int func, &base, &end); /* Mark this space as reserved */ - e820_add_region(base, size, E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_add(base, size, E820_TYPE_RESERVED); + e820__update_table(e820_table); } static void __init intel_graphics_quirks(int num, int slot, int func) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 8a121991e5ba..0f0840304452 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -17,6 +17,7 @@ #include <asm/intel-mid.h> #include <asm/pgtable.h> #include <linux/usb/ehci_def.h> +#include <linux/usb/xhci-dbgp.h> #include <linux/efi.h> #include <asm/efi.h> #include <asm/pci_x86.h> @@ -381,6 +382,10 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "efi", 3)) early_console_register(&early_efi_console, keep); #endif +#ifdef CONFIG_EARLY_PRINTK_USB_XDBC + if (!strncmp(buf, "xdbc", 4)) + early_xdbc_parse_parameter(buf + 4); +#endif buf++; } diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 04f89caef9c4..8e598a1ad986 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -50,11 +50,11 @@ #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) /* There is address space for how many espfix pages? */ -#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) +#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) #define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS -# error "Need more than one PGD for the ESPFIX hack" +# error "Need more virtual address space for the ESPFIX hack" #endif #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) @@ -121,11 +121,13 @@ static void init_espfix_random(void) void __init init_espfix_bsp(void) { - pgd_t *pgd_p; + pgd_t *pgd; + p4d_t *p4d; /* Install the espfix pud into the kernel page directory */ - pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; - pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); + p4d_populate(&init_mm, p4d, espfix_pud_page); /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8639bb2ae058..0651e974dcb3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,7 +24,7 @@ #include <trace/syscall.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> @@ -533,9 +533,15 @@ static void do_sync_core(void *data) static void run_sync(void) { - int enable_irqs = irqs_disabled(); + int enable_irqs; - /* We may be called with interrupts disbled (on bootup). */ + /* No need to sync if there's only one CPU */ + if (num_online_cpus() == 1) + return; + + enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disabled (on bootup). */ if (enable_irqs) local_irq_enable(); on_each_cpu(do_sync_core, NULL, 1); @@ -983,6 +989,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long return_hooker = (unsigned long) &return_to_handler; + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. 
+ * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + if (unlikely(ftrace_graph_is_dead())) return; diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S new file mode 100644 index 000000000000..722a145b4139 --- /dev/null +++ b/arch/x86/kernel/ftrace_32.S @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2017 Steven Rostedt, VMware Inc. + */ + +#include <linux/linkage.h> +#include <asm/page_types.h> +#include <asm/segment.h> +#include <asm/export.h> +#include <asm/ftrace.h> + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +EXPORT_SYMBOL(__fentry__) +#else +# define function_hook mcount +EXPORT_SYMBOL(mcount) +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */ +#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER) +# define USING_FRAME_POINTER +#endif + +#ifdef USING_FRAME_POINTER +# define MCOUNT_FRAME 1 /* using frame = true */ +#else +# define MCOUNT_FRAME 0 /* using frame = false */ +#endif + +ENTRY(function_hook) + ret +END(function_hook) + +ENTRY(ftrace_caller) + +#ifdef USING_FRAME_POINTER +# ifdef CC_USING_FENTRY + /* + * Frame pointers are of ip followed by bp. + * Since fentry is an immediate jump, we are left with + * parent-ip, function-ip. We need to add a frame with + * parent-ip followed by ebp. + */ + pushl 4(%esp) /* parent ip */ + pushl %ebp + movl %esp, %ebp + pushl 2*4(%esp) /* function ip */ +# endif + /* For mcount, the function ip is directly above */ + pushl %ebp + movl %esp, %ebp +#endif + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + +#ifdef USING_FRAME_POINTER + /* Load parent ebp into edx */ + movl 4*4(%esp), %edx +#else + /* There's no frame pointer, load the appropriate stack addr instead */ + lea 4*4(%esp), %edx +#endif + + movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */ + /* Get the parent ip */ + movl 4(%edx), %edx /* edx has ebp */ + + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +#ifdef USING_FRAME_POINTER + popl %ebp +# ifdef CC_USING_FENTRY + addl $4,%esp /* skip function ip */ + popl %ebp /* this is the orig bp */ + addl $4, %esp /* skip parent ip */ +# endif +#endif +.Lftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, &regs->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * regs->ip location, and move flags into the return ip location.
+ */ + pushl $__KERNEL_CS + pushl 4(%esp) /* Save the return ip */ + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + + /* Get flags and place them into the return ip slot */ + pushf + popl %eax + movl %eax, 8*4(%esp) + + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ +#ifdef CC_USING_FENTRY + movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */ +#else + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ +#endif + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + + /* restore flags */ + push 14*4(%esp) + popf + + /* Move return ip back to its original location */ + movl 12*4(%esp), %eax + movl %eax, 14*4(%esp) + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + + /* use lea to not affect flags */ + lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */ + + jmp .Lftrace_ret +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(function_hook) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz .Ltrace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +.Ltrace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(function_hook) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 3*4(%esp), %eax + /* Even with frame pointers, fentry doesn't have one here */ +#ifdef CC_USING_FENTRY + lea 4*4(%esp), %edx + movl $0, %ecx +#else + lea 0x4(%ebp), %edx + movl (%ebp), %ecx +#endif + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx +#ifdef CC_USING_FENTRY + movl $0, %eax +#else + movl %ebp, %eax +#endif + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/ftrace_64.S index 7b0d3da52fb4..1dfac634bbf7 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -1,6 +1,4 @@ /* - * linux/arch/x86_64/mcount_64.S - * * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ @@ -13,9 +11,6 @@ .code64 .section .entry.text, "ax" - -#ifdef CONFIG_FUNCTION_TRACER - #ifdef CC_USING_FENTRY # define function_hook __fentry__ EXPORT_SYMBOL(__fentry__) @@ -297,7 +292,6 @@ trace: jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index e5fb436a6548..538ec012b371 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,7 +12,7 @@ #include <asm/setup.h> #include <asm/sections.h> -#include <asm/e820.h> +#include <asm/e820/api.h> 
#include <asm/page.h> #include <asm/apic.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54a2372f5dbb..43b7002f44fb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -4,6 +4,7 @@ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE */ +#define DISABLE_BRANCH_PROFILING #include <linux/init.h> #include <linux/linkage.h> #include <linux/types.h> @@ -23,7 +24,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/kdebug.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> #include <asm/microcode.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b467b14b03eb..ac9d327d2e42 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -269,10 +269,8 @@ ENTRY(secondary_startup_64) /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - jmp start_cpu -ENDPROC(secondary_startup_64) -ENTRY(start_cpu) +.Ljump_to_C_code: /* * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump @@ -305,7 +303,7 @@ ENTRY(start_cpu) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(start_cpu) +ENDPROC(secondary_startup_64) #include "verify_cpu.S" @@ -313,11 +311,11 @@ ENDPROC(start_cpu) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary() via start_cpu(). + * start_secondary() via .Ljump_to_C_code. */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp - jmp start_cpu + jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dc6ba5bda9fc..89ff7af2de50 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -354,7 +354,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_irq(hdev->irq); + disable_hardirq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index be22f5a2192e..4e3b8a587c88 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -418,6 +418,7 @@ struct legacy_pic default_legacy_pic = { }; struct legacy_pic *legacy_pic = &default_legacy_pic; +EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4d8183b5f113..f34fe7444836 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -394,6 +394,9 @@ int check_irq_vectors_for_cpu_disable(void) !cpumask_subset(&affinity_new, &online_new)) this_count++; } + /* No need to check any further. 
*/ + if (!this_count) + return 0; count = 0; for_each_online_cpu(cpu) { @@ -411,8 +414,10 @@ int check_irq_vectors_for_cpu_disable(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { if (!test_bit(vector, used_vectors) && - IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - count++; + IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { + if (++count == this_count) + return 0; + } } } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1423ab1b0312..7468c6987547 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -195,7 +195,5 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); -#ifdef CONFIG_X86_32 irq_ctx_init(smp_processor_id()); -#endif } diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index bdb83e431d89..38b64587b31b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -167,7 +167,7 @@ static int __init boot_params_kdebugfs_init(void) struct dentry *dbp, *version, *data; int error = -ENOMEM; - dbp = debugfs_create_dir("boot_params", NULL); + dbp = debugfs_create_dir("boot_params", arch_debugfs_dir); if (!dbp) return -ENOMEM; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index d0a814a9d96a..9d7fd5e6689a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -25,6 +25,7 @@ #include <asm/setup.h> #include <asm/crash.h> #include <asm/efi.h> +#include <asm/e820/api.h> #include <asm/kexec-bzimage64.h> #define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ @@ -99,15 +100,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_saved->nr_map; + nr_e820_entries = e820_table_firmware->nr_entries; - /* TODO: Pass entries more than E820MAX in bootparams setup data */ - if (nr_e820_entries > E820MAX) - nr_e820_entries = E820MAX; + /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ + if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) + nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; params->e820_entries = nr_e820_entries; - memcpy(&params->e820_map, &e820_saved->map, - nr_e820_entries * sizeof(struct e820entry)); + memcpy(&params->e820_table, &e820_table_firmware->entries, nr_e820_entries*sizeof(struct e820_entry)); return 0; } @@ -232,10 +232,10 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, nr_e820_entries = params->e820_entries; for (i = 0; i < nr_e820_entries; i++) { - if (params->e820_map[i].type != E820_RAM) + if (params->e820_table[i].type != E820_TYPE_RAM) continue; - start = params->e820_map[i].addr; - end = params->e820_map[i].addr + params->e820_map[i].size - 1; + start = params->e820_table[i].addr; + end = params->e820_table[i].addr + params->e820_table[i].size - 1; if ((start <= 0x100000) && end > 0x100000) { mem_k = (end >> 10) - (0x100000 >> 10); diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c6ee63f927ab..db2182d63ed0 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -67,7 +67,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(kprobe_opcode_t *instruction); +extern int can_boost(struct insn *insn, void *orig_addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); @@ -75,7 +75,7 @@ extern unsigned long
recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode. */ -extern int __copy_instruction(u8 *dest, u8 *src); +extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); /* Generate a relative-jump/call instruction */ extern void synthesize_reljump(void *from, void *to); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6384eb754a58..5b2bbfbb3712 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -61,6 +61,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" @@ -164,42 +165,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) NOKPROBE_SYMBOL(skip_prefixes); /* - * Returns non-zero if opcode is boostable. + * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(kprobe_opcode_t *opcodes) +int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; - if (search_exception_tables((unsigned long)opcodes)) + if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, + if (insn->opcode.nbytes == 2) + return test_bit(insn->opcode.bytes[1], (unsigned long *)twobyte_is_boostable); - } + + if (insn->opcode.nbytes != 1) + return 0; + + /* Can't boost Address-size override prefix */ + if (unlikely(inat_is_address_size_prefix(insn->attr))) + return 0; + + opcode = insn->opcode.bytes[0]; switch (opcode & 0xf0) { -#ifdef CONFIG_X86_64 - case 0x40: - goto retry; /* REX prefix is boostable */ -#endif case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); + /* can't boost "bound" */ + return (opcode != 0x62); case 0x70: return 0; /* can't boost conditional jump */ + case 0x90: + return opcode != 0x9a; /* can't boost call far */ case 0xc0: /* can't boost software-interruptions */ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; @@ -210,14 +207,9 @@ retry: /* can boost in/out and absolute jmps */ return ((opcode & 0x04) || opcode == 0xea); case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ /* clear and set flags are boostable */ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); default: - /* segment override prefixes are boostable */ - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ /* CS override prefix and call are not boostable */ return (opcode != 0x2e && opcode != 0x9a); } @@ -264,7 +256,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Fortunately, we know that the original code is the ideal 5-byte * long NOP. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (faddr) memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); else @@ -276,7 +271,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. 
* Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. - * Returns zero if the instruction can not get recovered. + * Returns zero if the instruction can not get recovered (or access failed). */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -348,37 +343,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn) } /* - * Copy an instruction and adjust the displacement if the instruction - * uses the %rip-relative addressing mode. - * If it does, Return the address of the 32-bit displacement word. - * If not, return null. - * Only applicable to 64-bit x86. + * Copy an instruction with recovering modified instruction by kprobes + * and adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * This returns the length of copied instruction, or 0 if it has an error. */ -int __copy_instruction(u8 *dest, u8 *src) +int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) { - struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int length; unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); - if (!recovered_insn) + if (!recovered_insn || !insn) + return 0; + + /* This can access kernel text if given address is not recovered */ + if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE)) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); - length = insn.length; + + kernel_insn_init(insn, dest, MAX_INSN_SIZE); + insn_get_length(insn); /* Another subsystem puts a breakpoint, failed to recover */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 - if (insn_rip_relative(&insn)) { + /* Only x86_64 has RIP relative instructions */ + if (insn_rip_relative(insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest, length); - insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing * mode. Adjust the displacement for the difference between @@ -391,36 +385,57 @@ int __copy_instruction(u8 *dest, u8 *src) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; + newdisp = (u8 *) src + (s64) insn->displacement.value + - (u8 *) dest; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value); + pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", + src, dest, insn->displacement.value); return 0; } - disp = (u8 *) dest + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(insn); *(s32 *) disp = (s32) newdisp; } #endif - return length; + return insn->length; +} + +/* Prepare reljump right after instruction to boost */ +static void prepare_boost(struct kprobe *p, struct insn *insn) +{ + if (can_boost(insn, p->addr) && + MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. 
+ */ + synthesize_reljump(p->ainsn.insn + insn->length, + p->addr + insn->length); + p->ainsn.boostable = true; + } else { + p->ainsn.boostable = false; + } } static int arch_copy_kprobe(struct kprobe *p) { - int ret; + struct insn insn; + int len; + + set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Copy an instruction with recovering if other optprobe modifies it.*/ - ret = __copy_instruction(p->ainsn.insn, p->addr); - if (!ret) + len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + if (!len) return -EINVAL; /* * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. */ - if (can_boost(p->ainsn.insn)) - p->ainsn.boostable = 0; - else - p->ainsn.boostable = -1; + prepare_boost(p, &insn); + + set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Check whether the instruction modifies Interrupt Flag or not */ p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); @@ -459,7 +474,7 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } } @@ -531,7 +546,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPT) - if (p->ainsn.boostable == 1 && !p->post_handler) { + if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -851,7 +866,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, case 0xcf: case 0xea: /* jmp absolute -- ip is correct */ /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; case 0xe8: /* call relative - Fix return addr */ *tos = orig_ip + (*tos - copy_ip); @@ -876,28 +891,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, * jmp near and far, absolute indirect * ip is correct. And this is boostable */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; } default: break; } - if (p->ainsn.boostable == 0) { - if ((regs->ip > copy_ip) && - (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - synthesize_reljump((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - regs->ip += orig_ip - copy_ip; no_change: diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 5f8f0b3cc674..041f7b6dfa0f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { p->ainsn.insn = NULL; - p->ainsn.boostable = -1; + p->ainsn.boostable = false; return 0; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3d1bee9d6a72..901c640d152f 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -37,6 +37,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" @@ -65,7 +66,10 @@ found: * overwritten by jump destination address. In this case, original * bytes must be recovered from op->optinsn.copied_insn buffer. 
*/ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); @@ -174,11 +178,12 @@ NOKPROBE_SYMBOL(optimized_callback); static int copy_optimized_instructions(u8 *dest, u8 *src) { + struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) + ret = __copy_instruction(dest + len, src + len, &insn); + if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; } @@ -350,6 +355,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, } buf = (u8 *)op->optinsn.insn; + set_memory_rw((unsigned long)buf & PAGE_MASK, 1); /* Copy instructions into the out-of-line buffer */ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); @@ -372,6 +378,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, (u8 *)op->kp.addr + op->optinsn.size); + set_memory_ro((unsigned long)buf & PAGE_MASK, 1); + flush_icache_range((unsigned long) buf, (unsigned long) buf + TMPL_END_IDX + op->optinsn.size + RELATIVEJUMP_SIZE); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 14f65a5f938e..da5c09789984 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -396,9 +396,9 @@ static u64 kvm_steal_clock(int cpu) src = &per_cpu(steal_time, cpu); do { version = src->version; - rmb(); + virt_rmb(); steal = src->steal; - rmb(); + virt_rmb(); } while ((version & 1) || (version != src->version)); return steal; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 469b23d6acc2..8c53c5d7a1bc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,7 @@ #include <asm/io_apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) @@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one( pgd_t *pgd, pmd_t *pmd, pte_t *pte, unsigned long vaddr, unsigned long paddr) { + p4d_t *p4d; pud_t *pud; pgd += pgd_index(vaddr); @@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one( if (!(pgd_val(*pgd) & _PAGE_PRESENT)) set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); #endif - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); if (!(pmd_val(*pmd) & _PAGE_PRESENT)) set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 307b1f4543de..ce640428d6fe 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -27,6 +27,7 @@ #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> #include <asm/setup.h> +#include <asm/set_memory.h> #ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { @@ -36,6 +37,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = { static void free_transition_pgtable(struct kimage *image) { + free_page((unsigned long)image->arch.p4d); free_page((unsigned long)image->arch.pud); free_page((unsigned long)image->arch.pmd); free_page((unsigned long)image->arch.pte); @@ -43,6 +45,7 @@ static void free_transition_pgtable(struct kimage *image) 
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -53,13 +56,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + goto err; + image->arch.p4d = p4d; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) goto err; image->arch.pud = pud; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } - pud = pud_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); if (!pud_present(*pud)) { pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) @@ -194,19 +205,22 @@ static int arch_update_purgatory(struct kimage *image) /* Setup copying of backup region */ if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_dest", &image->arch.backup_load_addr, sizeof(image->arch.backup_load_addr), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_src", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_src", &image->arch.backup_src_start, sizeof(image->arch.backup_src_start), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_sz", &image->arch.backup_src_sz, sizeof(image->arch.backup_src_sz), 0); if (ret) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 477ae806c2fa..f67bd3205df7 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -85,7 +85,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f8d20497383..0d904d759ff1 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -26,7 +26,7 @@ #include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/setup.h> #include <asm/smp.h> @@ -826,10 +826,10 @@ static int __init parse_alloc_mptable_opt(char *p) } early_param("alloc_mptable", parse_alloc_mptable_opt); -void __init early_reserve_e820_mpc_new(void) +void __init e820__memblock_alloc_reserved_mpc_new(void) { if (enable_update_mptable && alloc_mptable) - mpc_new_phys = early_reserve_e820(mpc_new_length, 4); + mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4); } static int __init update_mp_table(void) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f088ea4c66e7..446c8aa09b9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -166,11 +166,9 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) spin_lock_irqsave(&desc->lock, flags); /* - * most handlers of type NMI_UNKNOWN never return because - * they just assume the NMI is theirs. Just a sanity check - * to manage expectations + * Indicate if there are multiple registrations on the + * internal NMI handler call chains (SERR and IO_CHECK). 
*/ - WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); @@ -224,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - if (panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4797e87b0fb6..3586996fc50d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, .alloc_pud = paravirt_nop, + .alloc_p4d = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, .release_pud = paravirt_nop, + .release_p4d = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, @@ -430,12 +432,19 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, + .set_p4d = native_set_p4d, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .p4d_val = PTE_IDENT, + .make_p4d = PTE_IDENT, + .set_pgd = native_set_pgd, -#endif +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 0c150c06fa5a..fda7867046d0 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1007,9 +1007,8 @@ static void __init calgary_enable_translation(struct pci_dev *dev) writel(cpu_to_be32(val32), target); readl(target); /* flush */ - init_timer(&tbl->watchdog_timer); - tbl->watchdog_timer.function = &calgary_watchdog; - tbl->watchdog_timer.data = (unsigned long)dev; + setup_timer(&tbl->watchdog_timer, &calgary_watchdog, + (unsigned long)dev); mod_timer(&tbl->watchdog_timer, jiffies); } diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index d5f15c3f7b25..963e3fb56437 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -14,7 +14,7 @@ #include <asm/probe_roms.h> #include <asm/pci-direct.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mmzone.h> #include <asm/setup.h> #include <asm/sections.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f67591561711..0bb88428cbf2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,6 +37,7 @@ #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> +#include <asm/prctl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -124,11 +125,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -137,15 +133,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. 
*/ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -154,7 +145,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; +DEFINE_PER_CPU(u64, msr_misc_features_shadow); - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); +static void set_cpuid_faulting(bool on) +{ + u64 msrval; - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} - update_debugctlmsr(debugctl); +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); } + preempt_enable(); +} - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); +} - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) +{ + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - /* * Make sure that the TSS limit is correct for the CPU * to notice the IO bitmap. 
*/ refresh_tss_limit(); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); } /* @@ -550,3 +616,16 @@ out: put_task_stack(p); return ret; } + +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4c818f8bc135..ff40e74c9181 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/ldt.h> @@ -56,6 +57,7 @@ #include <asm/switch_to.h> #include <asm/vm86.h> #include <asm/intel_rdt.h> +#include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) { @@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d6b784a5520d..b6840bf3940b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -52,6 +53,11 @@ #include <asm/xen/hypervisor.h> #include <asm/vdso.h> #include <asm/intel_rdt.h> +#include <asm/unistd.h> +#ifdef CONFIG_IA32_EMULATION +/* Not included via unistd.h */ +#include <asm/unistd_32_ia32.h> +#endif __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -204,7 +210,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, tls); + err = do_arch_prctl_64(p, ARCH_SET_FS, tls); if (err) goto out; } @@ -440,7 +446,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV /* * On Xen PV, IOPL bits in pt_regs->flags have no effect, and * current_pt_regs()->flags may not match the current task's @@ -493,6 +499,8 @@ void set_personality_64bit(void) clear_thread_flag(TIF_IA32); 
clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); + /* Pretend that this comes from a 64bit execve */ + task_pt_regs(current)->orig_ax = __NR_execve; /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -505,32 +513,50 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(bool x32) +static void __set_personality_x32(void) { - /* inherit personality from parent */ +#ifdef CONFIG_X86_X32 + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; + current->personality &= ~READ_IMPLIES_EXEC; + /* + * in_compat_syscall() uses the presence of the x32 syscall bit + * flag to determine compat status. The x86 mmap() code relies on + * the syscall bitness so set x32 syscall bit right here to make + * in_compat_syscall() work during exec(). + * + * Pretend to come from a x32 execve. + */ + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; + current->thread.status &= ~TS_COMPAT; +#endif +} + +static void __set_personality_ia32(void) +{ +#ifdef CONFIG_IA32_EMULATION + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + task_pt_regs(current)->orig_ax = __NR_ia32_execve; + current->thread.status |= TS_COMPAT; +#endif +} +void set_personality_ia32(bool x32) +{ /* Make sure to be in 32bit mode */ set_thread_flag(TIF_ADDR32); - /* Mark the associated mm as containing 32-bit tasks. */ - if (x32) { - clear_thread_flag(TIF_IA32); - set_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_X32; - current->personality &= ~READ_IMPLIES_EXEC; - /* in_compat_syscall() uses the presence of the x32 - syscall bit flag to determine compat status */ - current->thread.status &= ~TS_COMPAT; - } else { - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_IA32; - current->personality |= force_personality32; - /* Prepare the first "return" to user space */ - current->thread.status |= TS_COMPAT; - } + if (x32) + __set_personality_x32(); + else + __set_personality_ia32(); } EXPORT_SYMBOL_GPL(set_personality_ia32); @@ -547,70 +573,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (option) { case ARCH_SET_GS: - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.gsindex = 0; - task->thread.gsbase = addr; + task->thread.gsbase = arg2; if (doit) { load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); } put_cpu(); break; case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.fsindex = 0; - task->thread.fsbase = addr; + task->thread.fsbase = arg2; if (doit) { /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, arg2); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; + if (doit) rdmsrl(MSR_FS_BASE, 
base); else base = task->thread.fsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base; + if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: - return prctl_map_vdso(&vdso_image_x32, addr); + return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: - return prctl_map_vdso(&vdso_image_32, addr); + return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: - return prctl_map_vdso(&vdso_image_64, addr); + return prctl_map_vdso(&vdso_image_64, arg2); #endif default: @@ -621,10 +649,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl(current, code, addr); + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(current, option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); } +#endif unsigned long KSTK_ESP(struct task_struct *task) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2364b23ea3e5..f37d18124648 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -396,12 +396,12 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl + * When changing the segment base, use do_arch_prctl_64 * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ if (child->thread.fsbase != value) - return do_arch_prctl(child, ARCH_SET_FS, value); + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -410,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl(child, ARCH_SET_GS, value); + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } @@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request, Works just like arch_prctl, except that the arguments are reversed. 
*/ case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); + ret = do_arch_prctl_64(child, data, addr); break; #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e244c19a2451..2544700a2a87 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -223,6 +223,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, + { /* Handle problems with rebooting on ASUS EeeBook X205TA */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TAW", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + }, + }, /* Certec */ { /* Handle problems with rebooting on Certec BPC600 */ @@ -749,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs) #endif +/* This is the CPU performing the emergency shutdown work. */ +int crashing_cpu = -1; + #if defined(CONFIG_SMP) -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 2408c1603438..5ab3895516ac 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,5 +1,5 @@ #include <linux/ioport.h> -#include <asm/e820.h> +#include <asm/e820/api.h> static void resource_clip(struct resource *res, resource_size_t start, resource_size_t end) @@ -25,10 +25,10 @@ static void resource_clip(struct resource *res, resource_size_t start, static void remove_e820_regions(struct resource *avail) { int i; - struct e820entry *entry; + struct e820_entry *entry; - for (i = 0; i < e820->nr_map; i++) { - entry = &e820->map[i]; + for (i = 0; i < e820_table->nr_entries; i++) { + entry = &e820_table->entries[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4bf0c8926a1c..603a1669a2ec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -70,12 +70,13 @@ #include <linux/tboot.h> #include <linux/jiffies.h> +#include <linux/usb/xhci-dbgp.h> #include <video/edid.h> #include <asm/mtrr.h> #include <asm/apic.h> #include <asm/realmode.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/efi.h> @@ -119,7 +120,7 @@ * max_low_pfn_mapped: highest direct mapped pfn under 4GB * max_pfn_mapped: highest direct mapped pfn over 4GB * - * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; @@ -173,14 +174,11 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -/* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data = { - .wp_works_ok = -1, -}; +/* cpu data as detected by the assembly code in head_32.S */ +struct cpuinfo_x86 new_cpu_data; + /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .wp_works_ok = -1, -}; +struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; @@ -426,7 +424,7 @@ static void __init 
parse_setup_data(void) switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(pa_data, data_len); + e820__memory_setup_extended(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -441,29 +439,6 @@ static void __init parse_setup_data(void) } } -static void __init e820_reserve_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - e820_update_range(pa_data, sizeof(*data)+data->len, - E820_RAM, E820_RESERVED_KERN); - pa_data = data->next; - early_memunmap(data, sizeof(*data)); - } - - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); - memcpy(e820_saved, e820, sizeof(struct e820map)); - printk(KERN_INFO "extended physical RAM map:\n"); - e820_print_map("reserve setup_data"); -} - static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; @@ -756,16 +731,16 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. * take them out. */ - e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__update_table(e820_table); } /* called before trim_bios_range() to spare extra sanitize */ @@ -775,18 +750,18 @@ static void __init e820_add_kernel_range(void) u64 size = __pa_symbol(_end) - start; /* - * Complain if .text .data and .bss are not marked as E820_RAM and + * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and * attempt to fix it by adding the range. We may have a confused BIOS, * or the user may have used memmap=exactmap or memmap=xxM$yyM to * exclude kernel range. If we really are running on top non-RAM, * we will crash later anyways. */ - if (e820_all_mapped(start, start + size, E820_RAM)) + if (e820__mapped_all(start, start + size, E820_TYPE_RAM)) return; - pr_warn(".text .data .bss are not marked as E820_RAM!\n"); - e820_remove_range(start, size, E820_RAM, 0); - e820_add_region(start, size, E820_RAM); + pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); + e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_add(start, size, E820_TYPE_RAM); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; @@ -837,6 +812,26 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +static void __init simple_udelay_calibration(void) +{ + unsigned int tsc_khz, cpu_khz; + unsigned long lpj; + + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + + tsc_khz = tsc_khz ? : cpu_khz; + if (!tsc_khz) + return; + + lpj = tsc_khz * 1000; + do_div(lpj, HZ); + loops_per_jiffy = lpj; +} + /* * Determine if we were loaded by an EFI loader. 
If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -939,7 +934,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; - setup_memory_map(); + e820__memory_setup(); parse_setup_data(); copy_edd(); @@ -985,6 +980,8 @@ void __init setup_arch(char **cmdline_p) */ x86_configure_nx(); + simple_udelay_calibration(); + parse_early_param(); #ifdef CONFIG_MEMORY_HOTPLUG @@ -1028,9 +1025,8 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif - /* update the e820_saved too */ - e820_reserve_setup_data(); - finish_e820_parsing(); + e820__reserve_setup_data(); + e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) efi_init(); @@ -1056,11 +1052,11 @@ void __init setup_arch(char **cmdline_p) trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { - e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, - E820_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, + E820_TYPE_RESERVED); + e820__update_table(e820_table); printk(KERN_INFO "fixed physical RAM map:\n"); - e820_print_map("bad_ppro"); + e820__print_table("bad_ppro"); } #else early_gart_iommu_check(); @@ -1070,12 +1066,12 @@ void __init setup_arch(char **cmdline_p) * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); max_possible_pfn = max_pfn; @@ -1094,7 +1090,7 @@ void __init setup_arch(char **cmdline_p) /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) - max_low_pfn = e820_end_of_low_ram_pfn(); + max_low_pfn = e820__end_of_low_ram_pfn(); else max_low_pfn = max_pfn; @@ -1111,7 +1107,7 @@ void __init setup_arch(char **cmdline_p) early_alloc_pgt_buf(); /* - * Need to conclude brk, before memblock_x86_fill() + * Need to conclude brk, before e820__memblock_setup() * it could use memblock_find_in_range, could overlap with * brk area. */ @@ -1120,7 +1116,10 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); memblock_set_current_limit(ISA_END_ADDRESS); - memblock_x86_fill(); + e820__memblock_setup(); + + if (!early_xdbc_setup_hardware()) + early_xdbc_register_console(); reserve_bios_regions(); @@ -1137,7 +1136,7 @@ void __init setup_arch(char **cmdline_p) } /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); + e820__memblock_alloc_reserved_mpc_new(); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); @@ -1226,21 +1225,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); -#ifdef CONFIG_X86_32 - /* sync back kernel address range */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. 
- */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif - tboot_probe(); map_vsyscall(); @@ -1275,12 +1259,12 @@ void __init setup_arch(char **cmdline_p) kvm_guest_init(); - e820_reserve_resources(); - e820_mark_nosave_regions(max_low_pfn); + e820__reserve_resources(); + e820__register_nosave_regions(max_low_pfn); x86_init.resources.reserve_resources(); - e820_setup_gap(); + e820__setup_pci_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9820d6d977c6..bb1e8cc0bc84 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu) pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); gdt.s = 1; - write_gdt_entry(get_cpu_gdt_table(cpu), + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); #endif } @@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); + +#ifdef CONFIG_X86_32 + /* + * Sync back kernel address range. We want to make sure that + * all kernel mappings, including percpu mappings, are available + * in the smpboot asm. We can't reliably pick up percpu + * mappings using vmalloc_fault(), because exception dispatch + * needs percpu data. + */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +#endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 396c042e9d0e..cc30a74e4adb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -846,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) task_pid_nr(current) > 1 ? 
KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ec1f756f9dc9..71beb28600d4 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, if (from->si_signo == SIGSEGV) { if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)&to->si_lower; - compat_uptr_t upper = (unsigned long)&to->si_upper; + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; put_user_ex(lower, &to->si_lower); put_user_ex(upper, &to->si_upper); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d3c66a15bbde..d798c0da451c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -33,6 +33,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> +#include <asm/virtext.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -124,7 +125,7 @@ static bool smp_no_nmi_ipi = false; static void native_smp_send_reschedule(int cpu) { if (unlikely(cpu_is_offline(cpu))) { - WARN_ON(1); + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } apic->send_IPI(cpu, RESCHEDULE_VECTOR); @@ -162,6 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; + cpu_emergency_vmxoff(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -174,6 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { ipi_entering_ack_irq(); + cpu_emergency_vmxoff(); stop_this_cpu(NULL); irq_exit(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bd1f1ad35284..f04479a8f74f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8e2b79b88e51..8dabd7bf1673 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -76,6 +76,101 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE + +#define STACKTRACE_DUMP_ONCE(task) ({ \ + static bool __section(.data.unlikely) __dumped; \ + \ + if (!__dumped) { \ + __dumped = true; \ + WARN_ON(1); \ + show_stack(task, NULL); \ + } \ +}) + +static int __save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) +{ + struct unwind_state state; + struct pt_regs *regs; + unsigned long addr; + + for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + + regs = unwind_get_entry_regs(&state); + if (regs) { + /* + * Kernel mode registers on the stack indicate an + * in-kernel interrupt or exception (e.g., preemption + * or a page fault), which can make frame pointers + * unreliable. 
+ */ + if (!user_mode(regs)) + return -EINVAL; + + /* + * The last frame contains the user mode syscall + * pt_regs. Skip it and finish the unwind. + */ + unwind_next_frame(&state); + if (!unwind_done(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + break; + } + + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know + * about. + */ + if (!addr) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (save_stack_address(trace, addr, false)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; + + return 0; +} + +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; + + if (!try_get_task_stack(tsk)) + return -EINVAL; + + ret = __save_stack_trace_reliable(trace, tsk); + + put_task_stack(tsk); + + return ret; +} +#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ + /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -138,4 +233,3 @@ void save_stack_trace_user(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } - diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 50215a4b9347..207b8f2582c7 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -17,6 +17,8 @@ #include <linux/uaccess.h> #include <linux/elf.h> +#include <asm/elf.h> +#include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> @@ -101,7 +103,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { + if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -114,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, if (current->flags & PF_RANDOMIZE) { *begin = randomize_page(*begin, 0x02000000); } - } else { - *begin = current->mm->mmap_legacy_base; - *end = TASK_SIZE; + return; } + + *begin = get_mmap_base(1); + *end = in_compat_syscall() ? 
tasksize_32bit() : tasksize_64bit(); } unsigned long @@ -176,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) + if (!in_compat_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ @@ -191,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..4b1724059909 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -42,7 +42,7 @@ #include <asm/fixmap.h> #include <asm/proto.h> #include <asm/setup.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> #include "../realmode/rm/wakeup.h" @@ -68,9 +68,9 @@ void __init tboot_probe(void) * also verify that it is mapped as we expect it before calling * set_fixmap(), to reduce chance of garbage value causing crash */ - if (!e820_any_mapped(boot_params.tboot_addr, - boot_params.tboot_addr, E820_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + if (!e820__mapped_any(boot_params.tboot_addr, + boot_params.tboot_addr, E820_TYPE_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); @@ -188,12 +192,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820->nr_map; i++) { - if ((e820->map[i].type != E820_RAM) - && (e820->map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820_table->nr_entries; i++) { + if ((e820_table->entries[i].type != E820_TYPE_RAM) + && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) continue; - add_mac_region(e820->map[i].addr, e820->map[i].size); + add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = @@ -510,6 +514,9 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; + if (!intel_iommu_tboot_noforce) + return 1; + if (no_iommu || swiotlb || dmar_disabled) pr_warning("Forcing Intel-IOMMU to enabled\n"); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6c8934406dc9..dcd699baea1b 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) + if (LDT_empty(info) || LDT_zero(info)) { desc->a = desc->b = 0; - else + } else { fill_ldt(desc, info); + + /* + * Always set the accessed bit so that the CPU + * doesn't try to write to the (read-only) GDT. 
+ */ + desc->type |= 1; + } ++info; ++desc; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 948443e115c1..3995d3a777d4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -169,6 +169,37 @@ void ist_end_non_atomic(void) preempt_disable(); } +int is_valid_bugaddr(unsigned long addr) +{ + unsigned short ud; + + if (addr < TASK_SIZE_MAX) + return 0; + + if (probe_kernel_address((unsigned short *)addr, ud)) + return 0; + + return ud == INSN_UD0 || ud == INSN_UD2; +} + +static int fixup_bug(struct pt_regs *regs, int trapnr) +{ + if (trapnr != X86_TRAP_UD) + return 0; + + switch (report_bug(regs->ip, regs)) { + case BUG_TRAP_TYPE_NONE: + case BUG_TRAP_TYPE_BUG: + break; + + case BUG_TRAP_TYPE_WARN: + regs->ip += LEN_UD0; + return 1; + } + + return 0; +} + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) @@ -187,12 +218,15 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs, trapnr)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = trapnr; - die(str, regs, error_code); - } - return 0; + if (fixup_exception(regs, trapnr)) + return 0; + + if (fixup_bug(regs, trapnr)) + return 0; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); } return -1; @@ -255,7 +289,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } @@ -519,7 +553,7 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46bcda4cb1c2..714dfba6a1e7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -327,9 +327,16 @@ unsigned long long sched_clock(void) { return paravirt_sched_clock(); } + +bool using_native_sched_clock(void) +{ + return pv_time_ops.sched_clock == native_sched_clock; +} #else unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); + +bool using_native_sched_clock(void) { return true; } #endif int check_tsc_unstable(void) @@ -1112,8 +1119,10 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; + tsc_unstable = 1; - clear_sched_clock_stable(); + if (using_native_sched_clock()) + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } @@ -1135,18 +1144,20 @@ static struct clocksource clocksource_tsc = { void mark_tsc_unstable(char *reason) { - if (!tsc_unstable) { - tsc_unstable = 1; + if (tsc_unstable) + return; + + tsc_unstable = 1; + if (using_native_sched_clock()) clear_sched_clock_stable(); - disable_sched_clock_irqtime(); - pr_info("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_mark_unstable(&clocksource_tsc); - else { - clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; - clocksource_tsc.rating = 0; - } + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when 
not registered */ + if (clocksource_tsc.mult) { + clocksource_mark_unstable(&clocksource_tsc); + } else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; + clocksource_tsc.rating = 0; } } @@ -1322,6 +1333,8 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 478d15dbaee4..82c6d7f1fd73 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -1,6 +1,8 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> +#include <linux/interrupt.h> +#include <asm/sections.h> #include <asm/ptrace.h> #include <asm/bitops.h> #include <asm/stacktrace.h> @@ -23,53 +25,53 @@ val; \ }) -static void unwind_dump(struct unwind_state *state, unsigned long *sp) +static void unwind_dump(struct unwind_state *state) { static bool dumped_before = false; bool prev_zero, zero = false; - unsigned long word; + unsigned long word, *sp; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; if (dumped_before) return; dumped_before = true; - printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n", + printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp < state->stack_info.end; sp++) { - word = READ_ONCE_NOCHECK(*sp); + for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) + break; - prev_zero = zero; - zero = word == 0; + for (; sp < stack_info.end; sp++) { - if (zero) { - if (!prev_zero) - printk_deferred("%p: %016x ...\n", sp, 0); - continue; - } + word = READ_ONCE_NOCHECK(*sp); + + prev_zero = zero; + zero = word == 0; + + if (zero) { + if (!prev_zero) + printk_deferred("%p: %0*x ...\n", + sp, BITS_PER_LONG/4, 0); + continue; + } - printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word); + printk_deferred("%p: %0*lx (%pB)\n", + sp, BITS_PER_LONG/4, word, (void *)word); + } } } unsigned long unwind_get_return_address(struct unwind_state *state) { - unsigned long addr; - unsigned long *addr_p = unwind_get_return_address_ptr(state); - if (unwind_done(state)) return 0; - if (state->regs && user_mode(state->regs)) - return 0; - - addr = READ_ONCE_TASK_STACK(state->task, *addr_p); - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, - addr_p); - - return __kernel_text_address(addr) ? addr : 0; + return __kernel_text_address(state->ip) ? 
state->ip : 0; } EXPORT_SYMBOL_GPL(unwind_get_return_address); @@ -82,19 +84,68 @@ static size_t regs_size(struct pt_regs *regs) return sizeof(*regs); } +static bool in_entry_code(unsigned long ip) +{ + char *addr = (char *)ip; + + if (addr >= __entry_text_start && addr < __entry_text_end) + return true; + +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + if (addr >= __irqentry_text_start && addr < __irqentry_text_end) + return true; +#endif + + return false; +} + +static inline unsigned long *last_frame(struct unwind_state *state) +{ + return (unsigned long *)task_pt_regs(state->task) - 2; +} + +#ifdef CONFIG_X86_32 +#define GCC_REALIGN_WORDS 3 +#else +#define GCC_REALIGN_WORDS 1 +#endif + +static inline unsigned long *last_aligned_frame(struct unwind_state *state) +{ + return last_frame(state) - GCC_REALIGN_WORDS; +} + static bool is_last_task_frame(struct unwind_state *state) { - unsigned long bp = (unsigned long)state->bp; - unsigned long regs = (unsigned long)task_pt_regs(state->task); + unsigned long *last_bp = last_frame(state); + unsigned long *aligned_bp = last_aligned_frame(state); /* * We have to check for the last task frame at two different locations * because gcc can occasionally decide to realign the stack pointer and - * change the offset of the stack frame by a word in the prologue of a - * function called by head/entry code. + * change the offset of the stack frame in the prologue of a function + * called by head/entry code. Examples: + * + * <start_secondary>: + * push %edi + * lea 0x8(%esp),%edi + * and $0xfffffff8,%esp + * pushl -0x4(%edi) + * push %ebp + * mov %esp,%ebp + * + * <x86_64_start_kernel>: + * lea 0x8(%rsp),%r10 + * and $0xfffffffffffffff0,%rsp + * pushq -0x8(%r10) + * push %rbp + * mov %rsp,%rbp + * + * Note that after aligning the stack, it pushes a duplicate copy of + * the return address before pushing the frame pointer. */ - return bp == regs - FRAME_HEADER_SIZE || - bp == regs - FRAME_HEADER_SIZE - sizeof(long); + return (state->bp == last_bp || + (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1))); } /* @@ -111,26 +162,70 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } -static bool update_stack_state(struct unwind_state *state, void *addr, - size_t len) +static bool update_stack_state(struct unwind_state *state, + unsigned long *next_bp) { struct stack_info *info = &state->stack_info; - enum stack_type orig_type = info->type; + enum stack_type prev_type = info->type; + struct pt_regs *regs; + unsigned long *frame, *prev_frame_end, *addr_p, addr; + size_t len; + + if (state->regs) + prev_frame_end = (void *)state->regs + regs_size(state->regs); + else + prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE; + + /* Is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + frame = (unsigned long *)regs; + len = regs_size(regs); + state->got_irq = true; + } else { + frame = next_bp; + len = FRAME_HEADER_SIZE; + } /* - * If addr isn't on the current stack, switch to the next one. + * If the next bp isn't on the current stack, switch to the next one. * * We may have to traverse multiple stacks to deal with the possibility - * that 'info->next_sp' could point to an empty stack and 'addr' could - * be on a subsequent stack. + * that info->next_sp could point to an empty stack and the next bp + * could be on a subsequent stack. 
*/ - while (!on_stack(info, addr, len)) + while (!on_stack(info, frame, len)) if (get_stack_info(info->next_sp, state->task, info, &state->stack_mask)) return false; - if (!state->orig_sp || info->type != orig_type) - state->orig_sp = addr; + /* Make sure it only unwinds up and doesn't overlap the prev frame: */ + if (state->orig_sp && state->stack_info.type == prev_type && + frame < prev_frame_end) + return false; + + /* Move state to the next frame: */ + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + + /* Save the return address: */ + if (state->regs && user_mode(state->regs)) + state->ip = 0; + else { + addr_p = unwind_get_return_address_ptr(state); + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + addr, addr_p); + } + + /* Save the original stack pointer for unwind_dump(): */ + if (!state->orig_sp) + state->orig_sp = frame; return true; } @@ -138,14 +233,12 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; - unsigned long *next_bp, *next_frame; - size_t next_len; - enum stack_type prev_type = state->stack_info.type; + unsigned long *next_bp; if (unwind_done(state)) return false; - /* have we reached the end? */ + /* Have we reached the end? */ if (state->regs && user_mode(state->regs)) goto the_end; @@ -173,58 +266,25 @@ bool unwind_next_frame(struct unwind_state *state) */ state->regs = regs; state->bp = NULL; + state->ip = 0; return true; } - /* get the next frame pointer */ + /* Get the next frame pointer: */ if (state->regs) next_bp = (unsigned long *)state->regs->bp; else - next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); - - /* is the next frame pointer an encoded pointer to pt_regs? */ - regs = decode_frame_pointer(next_bp); - if (regs) { - next_frame = (unsigned long *)regs; - next_len = sizeof(*regs); - } else { - next_frame = next_bp; - next_len = FRAME_HEADER_SIZE; - } - - /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_frame, next_len)) { - /* - * Don't warn on bad regs->bp. An interrupt in entry code - * might cause a false positive warning. 
- */ - if (state->regs) - goto the_end; + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + /* Move to the next frame if it's safe: */ + if (!update_stack_state(state, next_bp)) goto bad_address; - } - - /* Make sure it only unwinds up and doesn't overlap the last frame: */ - if (state->stack_info.type == prev_type) { - if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) - goto bad_address; - - if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) - goto bad_address; - } - - /* move to the next frame */ - if (regs) { - state->regs = regs; - state->bp = NULL; - } else { - state->bp = next_bp; - state->regs = NULL; - } return true; bad_address: + state->error = true; + /* * When unwinding a non-current task, the task might actually be * running on another CPU, in which case it could be modifying its @@ -235,18 +295,29 @@ bad_address: if (state->task != current) goto the_end; + /* + * Don't warn if the unwinder got lost due to an interrupt in entry + * code or in the C handler before the first frame pointer got set up: + */ + if (state->got_irq && in_entry_code(state->ip)) + goto the_end; + if (state->regs && + state->regs->sp >= (unsigned long)last_aligned_frame(state) && + state->regs->sp < (unsigned long)task_pt_regs(state->task)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", state->regs, state->task->comm, - state->task->pid, next_frame); - unwind_dump(state, (unsigned long *)state->regs); + state->task->pid, next_bp); + unwind_dump(state); } else { printk_deferred_once(KERN_WARNING "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", state->bp, state->task->comm, - state->task->pid, next_frame); - unwind_dump(state, state->bp); + state->task->pid, next_bp); + unwind_dump(state); } the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; @@ -257,35 +328,24 @@ EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { - unsigned long *bp, *frame; - size_t len; + unsigned long *bp; memset(state, 0, sizeof(*state)); state->task = task; + state->got_irq = (regs); - /* don't even attempt to start from user mode regs */ + /* Don't even attempt to start from user mode regs: */ if (regs && user_mode(regs)) { state->stack_info.type = STACK_TYPE_UNKNOWN; return; } - /* set up the starting stack frame */ bp = get_frame_pointer(task, regs); - regs = decode_frame_pointer(bp); - if (regs) { - state->regs = regs; - frame = (unsigned long *)regs; - len = sizeof(*regs); - } else { - state->bp = bp; - frame = bp; - len = FRAME_HEADER_SIZE; - } - /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(frame, state->task, &state->stack_info, + /* Initialize stack info and make sure the frame data is accessible: */ + get_stack_info(bp, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, frame, len); + update_stack_state(state, bp); /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 22881ddcbb9f..039f36738e49 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -34,7 +34,7 @@ bool unwind_next_frame(struct unwind_state *state) return true; } - state->sp = info->next_sp; + state->sp = PTR_ALIGN(info->next_sp, sizeof(long)); } 
while (!get_stack_info(state->sp, state->task, info, &state->stack_mask)); @@ -49,7 +49,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, memset(state, 0, sizeof(*state)); state->task = task; - state->sp = first_frame; + state->sp = PTR_ALIGN(first_frame, sizeof(long)); get_stack_info(first_frame, state->task, &state->stack_info, &state->stack_mask); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89ce59a9..7924a5356c8a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) struct vm_area_struct *vma; spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); @@ -193,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb(); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c74ae9ce8dc4..c8a3b61be0aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -146,6 +146,7 @@ SECTIONS _edata = .; } :data + BUG_TABLE . = ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 11a93f005268..a088b2c47f73 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,7 +14,7 @@ #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/time.h> #include <asm/irq.h> #include <asm/io_apic.h> @@ -38,7 +38,7 @@ struct x86_init_ops x86_init __initdata = { .resources = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, - .memory_setup = default_machine_specific_memory_setup, + .memory_setup = e820__memory_setup_default, }, .mpparse = { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ab8e32f7b9a8..760433b2574a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,18 +86,6 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. 
source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3bff20710471..09d4b17be022 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b8597c691..000000000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/namei.h> -#include <linux/fs.h> -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static 
irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. 
- */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if 
(pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. - */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - 
__u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. 
PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. - */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if 
(!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if 
(match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a211b2..000000000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include <linux/kvm_host.h> - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - 
return 0; -} - -static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index efde6cc50875..a181ae76c71c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -876,6 +876,9 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; + if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) + return 1; + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2c0eea..a6fd40aade7c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -205,4 +205,15 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; +} + +static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_misc_features_enables & + MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 45c7306c8780..c25cfaf584e7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2547,7 +2547,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) u64 smbase; int ret; - if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); /* @@ -2596,11 +2596,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; - ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } @@ -3854,6 +3854,13 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; + u64 msr = 0; + + ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr); + if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + ctxt->ops->cpl(ctxt)) { + return emulate_gp(ctxt, 0); + } eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); @@ -5316,6 +5323,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; + unsigned emul_flags; ctxt->mem_read.pos = 0; @@ -5330,6 +5338,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5363,7 +5372,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5392,7 +5401,7 @@ int 
x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5446,7 +5455,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 73ea24d4f119..bdcd4139eca9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,7 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu, *found = NULL; + struct kvm_vcpu *vcpu; int i; s->wakeup_needed = false; @@ -59,16 +59,11 @@ static void pic_unlock(struct kvm_pic *s) if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { if (kvm_apic_accept_pic_intr(vcpu)) { - found = vcpu; - break; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; } } - - if (!found) - return; - - kvm_make_request(KVM_REQ_EVENT, found); - kvm_vcpu_kick(found); } } @@ -239,7 +234,7 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; s->output = 0; @@ -273,7 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm) return intno; } -void kvm_pic_reset(struct kvm_kpic_state *s) +static void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; @@ -422,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) return ret; } -static u32 pic_ioport_read(void *opaque, u32 addr1) +static u32 pic_ioport_read(void *opaque, u32 addr) { struct kvm_kpic_state *s = opaque; - unsigned int addr; int ret; - addr = addr1; - addr &= 1; if (s->poll) { - ret = pic_poll_read(s, addr1); + ret = pic_poll_read(s, addr); s->poll = 0; } else - if (addr == 0) + if ((addr & 1) == 0) if (s->read_reg_select) ret = s->isr; else @@ -456,76 +448,64 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { unsigned char data = *(unsigned char *)val; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { pr_pic_unimpl("non byte write\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: + pic_lock(s); pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); elcr_ioport_write(&s->pics[addr & 1], addr, data); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - pic_unlock(s); return 0; } static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - unsigned char data = 0; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; + unsigned char *data = (unsigned char *)val; if (len != 1) { memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: - data = 
pic_ioport_read(&s->pics[addr >> 7], addr); + pic_lock(s); + *data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_unlock(s); break; case 0x4d0: case 0x4d1: - data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_lock(s); + *data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - *(unsigned char *)val = data; - pic_unlock(s); return 0; } @@ -576,7 +556,7 @@ static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; if (!s->output) s->wakeup_needed = true; @@ -657,9 +637,14 @@ void kvm_pic_destroy(struct kvm *kvm) { struct kvm_pic *vpic = kvm->arch.vpic; + if (!vpic) + return; + + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; kfree(vpic); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e219e5c07d2..bdff437acbcb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -266,11 +266,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) spin_unlock(&ioapic->lock); } -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) + if (!ioapic_in_kernel(kvm)) return; kvm_make_scan_ioapic_request(kvm); } @@ -315,7 +313,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_vcpu_request_scan_ioapic(ioapic->kvm); + kvm_make_scan_ioapic_request(ioapic->kvm); break; } } @@ -624,10 +622,8 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); - return ret; } - kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -635,37 +631,36 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + if (!ioapic) + return; + cancel_delayed_work_sync(&ioapic->eoi_inject); + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); kvm->arch.vioapic = NULL; kfree(ioapic); } -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); state->irr &= ~ioapic->irr_delivered; spin_unlock(&ioapic->lock); - return 0; } -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); - return 0; } diff --git a/arch/x86/kvm/ioapic.h 
b/arch/x86/kvm/ioapic.h index 1cc6e54436db..29ce19732ccf 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -105,17 +105,13 @@ do { \ #define ASSERT(x) do { } while (0) #endif -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - static inline int ioapic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (ioapic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -132,8 +128,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426f67b4..000000000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. 
Kay <allen.m.kay@intel.com> - * Author: Weidong Han <weidong.han@intel.com> - * Author: Ben-Ami Yassour <benami@il.ibm.com> - */ - -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/stat.h> -#include <linux/iommu.h> -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. 
- */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." 
- " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 60d91c9d160c..5c24811e8b0b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -60,7 +60,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) if (irqchip_split(v->kvm)) return pending_userspace_extint(v); else - return pic_irqchip(v->kvm)->output; + return v->kvm->arch.vpic->output; } else return 0; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 40d5b2cf6061..d5005cc26521 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -78,40 +78,42 @@ void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - static inline int pic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (pic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_kernel(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; + int mode = 
kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_in_kernel(struct kvm *kvm) { - bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; + int mode = kvm->arch.irqchip_mode; - /* Matches with wmb after initializing kvm->irq_routing. */ + /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return ret; + return mode != KVM_IRQCHIP_NONE; } -void kvm_pic_reset(struct kvm_kpic_state *s); - void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6825cd36d13b..3cc3b2d130a0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -42,7 +42,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { - struct kvm_pic *pic = pic_irqchip(kvm); + struct kvm_pic *pic = kvm->arch.vpic; return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -232,11 +232,11 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!ioapic_in_kernel(kvm)) + if (!irqchip_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); + kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); unlock: mutex_unlock(&kvm->irq_lock); } @@ -274,42 +274,42 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int r = -EINVAL; - int delta; - unsigned max_pin; - + /* We can't check irqchip_in_kernel() here as some callers are + * currently inititalizing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. 
+ */ switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: - delta = 0; + if (irqchip_split(kvm)) + return -EINVAL; + e->irqchip.pin = ue->u.irqchip.pin; switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: - delta = 8; + e->irqchip.pin += PIC_NUM_PINS / 2; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: - if (!pic_in_kernel(kvm)) - goto out; - + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + return -EINVAL; e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: - if (!ioapic_in_kernel(kvm)) - goto out; - - max_pin = KVM_IOAPIC_NUM_PINS; + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) + return -EINVAL; e->set = kvm_set_ioapic_irq; break; default: - goto out; + return -EINVAL; } e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin + delta; - if (e->irqchip.pin >= max_pin) - goto out; break; case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; @@ -318,7 +318,7 @@ int kvm_set_routing_entry(struct kvm *kvm, e->msi.data = ue->u.msi.data; if (kvm_msi_route_invalid(kvm, e)) - goto out; + return -EINVAL; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -326,12 +326,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.sint = ue->u.hv_sint.sint; break; default: - goto out; + return -EINVAL; } - r = 0; -out: - return r; + return 0; } bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bad6a25067bc..c329d2894905 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -177,8 +177,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); - new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + new = kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); if (!new) goto out; @@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2285,8 +2287,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2338,14 +2340,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2439,7 
+2441,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac7810513d0e..5d3376f67794 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1498,6 +1498,21 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +/** + * kvm_arch_write_log_dirty - emulate dirty page logging + * @vcpu: Guest mode vcpu + * + * Emulate arch specific page modification logging for the + * nested hypervisor + */ +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->write_log_dirty) + return kvm_x86_ops->write_log_dirty(vcpu); + + return 0; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -4340,7 +4355,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -4349,6 +4365,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; + context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e91f2e4..27975807cc64 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -74,7 +74,8 @@ enum { int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -201,4 +202,5 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 37942e419c32..ea67dc876316 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -40,8 +40,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - slot->arch.gfn_track[i] = kvm_kvzalloc(npages * - sizeof(*slot->arch.gfn_track[i])); + slot->arch.gfn_track[i] = kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } @@ -160,6 +160,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } +void kvm_page_track_cleanup(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + cleanup_srcu_struct(&head->track_srcu); +} + void kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a01105485315..56241746abbd 
100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,13 +23,6 @@ * so the code in this file is compiled twice, once per pte size. */ -/* - * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro - * uses for EPT without A/D paging type. - */ -extern u64 __pure __using_nonexistent_pte_bit(void) - __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); - #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -39,10 +32,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -60,10 +52,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #define CMPXCHG cmpxchg #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 @@ -74,16 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK 0 - #define PT_GUEST_DIRTY_MASK 0 - #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() - #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_DIRTY_SHIFT 9 + #define PT_GUEST_ACCESSED_SHIFT 8 + #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif +#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) +#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) + #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) @@ -111,12 +104,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, + unsigned gpte) { unsigned mask; /* dirty bit is not supported, so no need to track it */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return; BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); @@ -171,7 +165,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, goto no_present; /* if accessed bit is not supported prefetch non accessed gpte */ - if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; return false; @@ -217,7 +211,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, int ret; /* dirty/accessed bits are not supported, so no need to update them */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return 0; for (level = walker->max_level; level >= walker->level; --level) { @@ 
-232,6 +226,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (level == walker->level && write_fault && !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); +#if PTTYPE == PTTYPE_EPT + if (kvm_arch_write_log_dirty(vcpu)) + return -EINVAL; +#endif pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) @@ -286,7 +284,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; + unsigned nested_access; gpa_t pte_gpa; + bool have_ad; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -299,6 +299,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, retry_walk: walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); + have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { @@ -312,7 +313,15 @@ retry_walk: walker->max_level = walker->level; ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = PT_GUEST_ACCESSED_MASK; + accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0; + + /* + * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging + * by the MOV to CR instruction are treated as reads and do not cause the + * processor to set the dirty flag in any EPT paging-structure entry. + */ + nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; + pt_access = pte_access = ACC_ALL; ++walker->level; @@ -332,7 +341,7 @@ retry_walk: walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK, + nested_access, &walker->fault); /* @@ -394,7 +403,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(&pte_access, pte); + FNAME(protect_clean_gpte)(mmu, &pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. 
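As a rough illustration of the paging_tmpl.h rework above: the guest accessed/dirty mask is now derived at run time from the MMU (via PT_HAVE_ACCESSED_DIRTY()) instead of being fixed per page-table type, so an EPT walk with A/D bits disabled simply uses a zero mask. The following standalone sketch mirrors that decision in plain user-space C; demo_mmu and have_accessed_dirty() are simplified stand-ins invented for illustration, not kernel code, and only the bit positions (8 and 9) are taken from the patch itself.

#include <stdbool.h>
#include <stdio.h>

/* EPT accessed/dirty bit positions, matching the values added in the patch. */
#define PT_GUEST_ACCESSED_SHIFT 8
#define PT_GUEST_DIRTY_SHIFT    9
#define PT_GUEST_ACCESSED_MASK  (1u << PT_GUEST_ACCESSED_SHIFT)
#define PT_GUEST_DIRTY_MASK     (1u << PT_GUEST_DIRTY_SHIFT)

/* Stand-in for struct kvm_mmu; only the new ept_ad flag matters here. */
struct demo_mmu {
	bool ept_ad;
};

/* Stand-in for PT_HAVE_ACCESSED_DIRTY(mmu) in the EPT case. */
static bool have_accessed_dirty(const struct demo_mmu *mmu)
{
	return mmu->ept_ad;
}

int main(void)
{
	struct demo_mmu ad_enabled  = { .ept_ad = true  };
	struct demo_mmu ad_disabled = { .ept_ad = false };

	/*
	 * Mirrors the accessed_dirty computation in walk_addr_generic():
	 * when A/D bits are not enabled for this MMU, the walker gets a
	 * zero mask and never tries to set them.
	 */
	unsigned int mask_on  = have_accessed_dirty(&ad_enabled)  ? PT_GUEST_ACCESSED_MASK : 0;
	unsigned int mask_off = have_accessed_dirty(&ad_disabled) ? PT_GUEST_ACCESSED_MASK : 0;

	printf("A/D masks: accessed=0x%x dirty=0x%x; walker mask with ept_ad=1: 0x%x, with ept_ad=0: 0x%x\n",
	       PT_GUEST_ACCESSED_MASK, PT_GUEST_DIRTY_MASK, mask_on, mask_off);
	return 0;
}

The same run-time check is what gates the new kvm_arch_write_log_dirty() call when the dirty bit is about to be set during an EPT walk, which is how nested PML emulation hooks into the walker.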
@@ -485,7 +494,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -979,7 +988,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -1025,3 +1034,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT #undef PT_GUEST_ACCESSED_SHIFT +#undef PT_HAVE_ACCESSED_DIRTY diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1efe2c62b3f..c27ac6923a18 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -741,7 +741,6 @@ static int svm_hardware_enable(void) struct svm_cpu_data *sd; uint64_t efer; - struct desc_ptr gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -763,8 +762,7 @@ static int svm_hardware_enable(void) sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - native_store_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.address; + gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); @@ -1198,10 +1196,13 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_CLGI); set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); set_intercept(svm, INTERCEPT_XSETBV); + if (!kvm_mwait_in_guest()) { + set_intercept(svm, INTERCEPT_MONITOR); + set_intercept(svm, INTERCEPT_MWAIT); + } + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); control->int_ctl = V_INTR_MASKING_MASK; @@ -1379,6 +1380,9 @@ static void avic_vm_destroy(struct kvm *kvm) unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; + if (!avic) + return; + avic_free_vm_id(vm_data->avic_vm_id); if (vm_data->avic_logical_id_table_page) @@ -5253,6 +5257,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +static void svm_setup_mce(struct kvm_vcpu *vcpu) +{ + /* [63:9] are reserved. 
*/ + vcpu->arch.mcg_cap &= 0x1ff; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5364,6 +5374,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, + .setup_mce = svm_setup_mce, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 283aa8601833..c6f4ad44aa95 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -251,6 +248,7 @@ struct __packed vmcs12 { u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; + u64 pml_address; u64 guest_ia32_debugctl; u64 guest_ia32_pat; u64 guest_ia32_efer; @@ -372,6 +370,7 @@ struct __packed vmcs12 { u16 guest_ldtr_selector; u16 guest_tr_selector; u16 guest_intr_status; + u16 guest_pml_index; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -410,6 +409,7 @@ struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; gpa_t vmxon_ptr; + bool pml_full; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -615,10 +615,6 @@ struct vcpu_vmx { int vpid; bool emulation_required; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; u32 exit_reason; /* Posted interrupt descriptor */ @@ -749,6 +745,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -774,6 +771,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(PML_ADDRESS, pml_address), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), FIELD64(GUEST_IA32_PAT, guest_ia32_pat), FIELD64(GUEST_IA32_EFER, guest_ia32_efer), @@ -914,8 +912,6 @@ static void nested_release_page_clean(struct page *page) static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); -static void kvm_cpu_vmxon(u64 addr); -static void kvm_cpu_vmxoff(void); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -935,7 +931,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
*/ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -static DEFINE_PER_CPU(struct desc_ptr, host_gdt); /* * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we @@ -1239,6 +1234,11 @@ static inline bool cpu_has_vmx_invvpid_global(void) return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1285,11 +1285,6 @@ static inline bool cpu_has_vmx_invpcid(void) SECONDARY_EXEC_ENABLE_INVPCID; } -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1324,6 +1319,11 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } +static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) +{ + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low); +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -1358,6 +1358,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); +} + static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -2052,14 +2057,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ static unsigned long segment_base(u16 selector) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *table; unsigned long v; if (!(selector & ~SEGMENT_RPL_MASK)) return 0; - table = (struct desc_struct *)gdt->address; + table = get_current_gdt_ro(); if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); @@ -2164,7 +2168,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - load_gdt(this_cpu_ptr(&host_gdt)); + load_fixmap_gdt(raw_smp_processor_id()); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2235,15 +2239,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - else if (!already_loaded) - loaded_vmcs_clear(vmx->loaded_vmcs); - if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2266,7 +2265,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (!already_loaded) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -2277,7 +2276,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ vmcs_writel(HOST_TR_BASE, (unsigned long)this_cpu_ptr(&cpu_tss)); - vmcs_writel(HOST_GDTR_BASE, gdt->address); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* * VM exits change the host TR limit to 0x67 after a VM @@ -2321,11 +2320,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 
vmx_vcpu_pi_put(vcpu); __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; - kvm_cpu_vmxoff(); - } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); @@ -2749,11 +2743,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high); vmx->nested.nested_vmx_secondary_ctls_low = 0; vmx->nested.nested_vmx_secondary_ctls_high &= + SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -2764,14 +2758,19 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; if (cpu_has_vmx_ept_execute_only()) vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT; + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_PML; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -2781,10 +2780,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. 
*/ - if (enable_vpid) + if (enable_vpid) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VPID; vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; - else + } else vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) @@ -3416,6 +3417,7 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); asm volatile (ASM_VMX_VMXON_RAX @@ -3458,14 +3460,8 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - cr4_set_bits(X86_CR4_VMXE); - - if (vmm_exclusive) { - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); - } - - native_store_gdt(this_cpu_ptr(&host_gdt)); + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); return 0; } @@ -3489,15 +3485,13 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); } static void hardware_disable(void) { - if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); - } - cr4_clear_bits(X86_CR4_VMXE); + vmclear_local_loaded_vmcss(); + kvm_cpu_vmxoff(); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -3547,11 +3541,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; + if (!kvm_mwait_in_guest()) + min |= CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING; + opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -3617,9 +3613,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -4011,11 +4007,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } else { + vpid_sync_context(vpid); } } @@ -4024,6 +4021,12 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); } +static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) +{ + if (enable_ept) + vmx_flush_tlb(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -5285,8 +5288,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; - vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -5406,8 +5407,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void enable_nmi_window(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis() || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } @@ -5448,19 +5448,6 @@ static 
void vmx_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (!is_guest_mode(vcpu)) { - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; - } - ++vcpu->stat.nmi_injections; vmx->nmi_known_unmasked = false; } @@ -5477,8 +5464,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis()) - return to_vmx(vcpu)->soft_vnmi_blocked; if (to_vmx(vcpu)->nmi_known_unmasked) return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; @@ -5488,20 +5473,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!cpu_has_virtual_nmis()) { - if (vmx->soft_vnmi_blocked != masked) { - vmx->soft_vnmi_blocked = masked; - vmx->vnmi_blocked_time = 0; - } - } else { - vmx->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } + vmx->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -5509,9 +5487,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->nested.nested_run_pending) return 0; - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); @@ -6232,21 +6207,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) unsigned long exit_qualification; gpa_t gpa; u32 error_code; - int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity == 0x2) { - printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; - return 0; + if (is_guest_mode(vcpu) + && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { + /* + * Fix up exit_qualification according to whether guest + * page table accesses are reads or writes. + */ + u64 eptp = nested_ept_get_cr3(vcpu); + if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) + exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; } /* @@ -6256,7 +6228,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * AAK134, BY25. 
*/ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -6342,7 +6313,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); @@ -6517,8 +6488,10 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); - if (!cpu_has_vmx_vpid()) + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) @@ -7093,34 +7066,24 @@ out_msr_bitmap: static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; - struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - /* The Intel VMX Instruction Reference lists a bunch of bits that - * are prerequisite to running VMXON, most notably cr4.VMXE must be - * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. We test these now: + /* + * The Intel VMX Instruction Reference lists a bunch of bits that are + * prerequisite to running VMXON, most notably cr4.VMXE must be set to + * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. But most faulting conditions + * have already been checked by hardware, prior to the VM-exit for + * VMXON. We do test guest cr4.VMXE because processor CR4 always has + * that bit set to 1 in non-root mode. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || - (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if (is_long_mode(vcpu) && !cs.l) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7147,29 +7110,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. 
*/ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - struct kvm_segment cs; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || - (is_long_mode(vcpu) && !cs.l)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - return 1; } @@ -7258,9 +7207,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 zero = 0; gpa_t vmptr; - struct vmcs12 *vmcs12; - struct page *page; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7271,22 +7219,9 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - /* - * For accurate processor emulation, VMCLEAR beyond available - * physical memory should do nothing at all. However, it is - * possible that a nested vmx bug, not a guest hypervisor bug, - * resulted in this case, so let's shut down before doing any - * more damage: - */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return 1; - } - vmcs12 = kmap(page); - vmcs12->launch_state = 0; - kunmap(page); - nested_release_page(page); + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, launch_state), + &zero, sizeof(zero)); nested_free_vmcs02(vmx, vmptr); @@ -7527,7 +7462,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &gva)) return 1; - /* _system ok, as nested_vmx_check_permission verified cpl=0 */ + /* _system ok, as hardware has verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); } @@ -7660,7 +7595,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &vmcs_gva)) return 1; - /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ + /* ok to use *_system, as hardware has verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, (void *)&to_vmx(vcpu)->nested.current_vmptr, sizeof(u64), &e)) { @@ -7693,11 +7628,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); @@ -7819,7 +7749,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) * "blocked by NMI" bit has to be set before next VM entry. 
*/ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -8121,6 +8050,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: @@ -8198,6 +8131,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PREEMPTION_TIMER: return false; + case EXIT_REASON_PML_FULL: + /* We emulate PML support to L1. */ + return false; default: return true; } @@ -8491,31 +8427,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && - !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu))))) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->soft_vnmi_blocked = 0; - } else if (vmx->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. - */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->soft_vnmi_blocked = 0; - } - } - if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8561,6 +8478,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } else { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmx_flush_tlb_ept_only(vcpu); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -8586,8 +8504,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) */ if (!is_guest_mode(vcpu) || !nested_cpu_has2(get_vmcs12(&vmx->vcpu), - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); + vmx_flush_tlb_ept_only(vcpu); + } } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -8782,37 +8702,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (cpu_has_virtual_nmis()) { - if (vmx->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. 
- * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->soft_vnmi_blocked)) - vmx->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, @@ -8929,10 +8845,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr, cr4; - /* Record the guest's net vcpu time for enforced NMI injections. 
*/ - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) - vmx->entry_time = ktime_get(); - /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) @@ -9140,16 +9052,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; - if (vmx->loaded_vmcs == &vmx->vmcs01) + if (vmx->loaded_vmcs == vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; + vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); vcpu->cpu = cpu; @@ -9167,7 +9079,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); BUG_ON(r); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); } @@ -9228,11 +9140,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) goto free_msrs; - if (!vmm_exclusive) - kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); loaded_vmcs_init(vmx->loaded_vmcs); - if (!vmm_exclusive) - kvm_cpu_vmxoff(); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -9474,13 +9382,20 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; - if (fault->error_code & PFERR_RSVD_MASK) + if (vmx->nested.pml_full) { + exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else if (fault->error_code & PFERR_RSVD_MASK) exit_reason = EXIT_REASON_EPT_MISCONFIG; else exit_reason = EXIT_REASON_EPT_VIOLATION; - nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); + + nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } @@ -9492,17 +9407,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { + u64 eptp; + WARN_ON(mmu_is_nested(vcpu)); + eptp = nested_ept_get_cr3(vcpu); + if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + return 1; + + kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT); + VMX_EPT_EXECUTE_ONLY_BIT, + eptp & VMX_EPT_AD_ENABLE_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + return 0; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -9694,10 +9618,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; page = nested_get_page(vcpu, vmcs12->msr_bitmap); - if (!page) { - WARN_ON(1); + if (!page) return false; - } msr_bitmap_l1 = (unsigned long *)kmap(page); memset(msr_bitmap_l0, 0xff, PAGE_SIZE); @@ -9816,6 +9738,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u64 
address = vmcs12->pml_address; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) { + if (!nested_cpu_has_ept(vmcs12) || + !IS_ALIGNED(address, 4096) || + address >> maxphyaddr) + return -EINVAL; + } + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -9989,8 +9927,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; - bool nested_ept_enabled = false; + u32 exec_control, vmcs12_exec_ctrl; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -10121,8 +10058,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) - exec_control |= vmcs12->secondary_vm_exec_control; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { + vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & + ~SECONDARY_EXEC_ENABLE_PML; + exec_control |= vmcs12_exec_ctrl; + } if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, @@ -10137,8 +10077,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_intr_status); } - nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; - /* * Write an illegal value to APIC_ACCESS_ADDR. Later, * nested_get_vmcs12_pages will either fix it up or @@ -10268,9 +10206,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } + if (enable_pml) { + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit, this happens + * to be equivalent to simply resetting the fields in vmcs02. + */ + ASSERT(vmx->pml_pg); + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + if (nested_cpu_has_ept(vmcs12)) { - kvm_mmu_unload(vcpu); - nested_ept_init_mmu_context(vcpu); + if (nested_ept_init_mmu_context(vcpu)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + } else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* @@ -10298,12 +10253,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_efer(vcpu, vcpu->arch.efer); /* Shadow page tables on either EPT or shadow page tables. 
*/ - if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) return 1; - kvm_mmu_reset_context(vcpu); - if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; @@ -10339,12 +10292,16 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_pml_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high) || + (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high)) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || @@ -10356,6 +10313,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.nested_vmx_entry_ctls_high)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || !nested_cr3_valid(vcpu, vmcs12->host_cr3)) @@ -10423,7 +10383,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct loaded_vmcs *vmcs02; - int cpu; u32 msr_entry_idx; u32 exit_qual; @@ -10436,18 +10395,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - + vmx_switch_vmcs(vcpu, vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, exit_qual); return 1; @@ -10460,7 +10413,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmcs12->vm_entry_msr_load_count); if (msr_entry_idx) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); return 1; @@ -11028,7 +10981,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (unlikely(vmx->fail)) vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { @@ -11072,6 +11025,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx->nested.change_vmcs01_virtual_x2apic_mode = false; vmx_set_virtual_x2apic_mode(vcpu, vcpu->arch.apic_base & X2APIC_ENABLE); + } else if 
(!nested_cpu_has_ept(vmcs12) && + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -11121,8 +11078,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ static void vmx_leave_nested(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.nested_run_pending = 0; nested_vmx_vmexit(vcpu, -1, 0, 0); + } free_nested(to_vmx(vcpu)); } @@ -11234,6 +11193,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t gpa; + struct page *page = NULL; + u64 *pml_address; + + if (is_guest_mode(vcpu)) { + WARN_ON_ONCE(vmx->nested.pml_full); + + /* + * Check if PML is enabled for the nested guest. + * Whether eptp bit 6 is set is already checked + * as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index > PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + + page = nested_get_page(vcpu, vmcs12->pml_address); + if (!page) + return 0; + + pml_address = kmap(page); + pml_address[vmcs12->guest_pml_index--] = gpa; + kunmap(page); + nested_release_page_clean(page); + } + + return 0; +} + static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t offset, unsigned long mask) @@ -11593,6 +11592,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .write_log_dirty = vmx_write_pml_buffer, .pre_block = vmx_pre_block, .post_block = vmx_post_block, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1faf620a6fdc..464da936c53d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,7 +27,6 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" #include "hyperv.h" @@ -1008,6 +1007,8 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, }; static unsigned num_emulated_msrs; @@ -1444,10 +1445,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 usdiff; bool matched; bool already_matched; u64 data = msr->data; + bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); @@ -1455,51 +1456,34 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - int faulted = 0; - - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("1: idivl %[divisor]\n" - "2: xor %%edx, %%edx\n" - " movl $0, %[faulted]\n" - "3:\n" - ".section .fixup,\"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - - : "=A"(usdiff), [faulted] "=r" (faulted) - : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); - -#endif - 
do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* idivl overflow => difference is larger than USEC_PER_SEC */ - if (faulted) - usdiff = USEC_PER_SEC; - } else - usdiff = USEC_PER_SEC; /* disable TSC match window below */ + if (data == 0 && msr->host_initiated) { + /* + * detection of vcpu initialization -- need to sync + * with other vCPUs. This particularly helps to keep + * kvm_clock stable after CPU hotplug + */ + synchronizing = true; + } else { + u64 tsc_exp = kvm->arch.last_tsc_write + + nsec_to_cycles(vcpu, elapsed); + u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; + /* + * Special case: TSC write with a small delta (1 second) + * of virtual cycle time against real time is + * interpreted as an attempt to synchronize the CPU. + */ + synchronizing = data < tsc_exp + tsc_hz && + data + tsc_hz > tsc_exp; + } + } /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. */ - if (usdiff < USEC_PER_SEC && + if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; @@ -1769,13 +1753,13 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } -static u64 __get_kvmclock_ns(struct kvm *kvm) +u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; @@ -1796,24 +1780,12 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) return __pvclock_read_cycles(&hv_clock, rdtsc()); } -u64 get_kvmclock_ns(struct kvm *kvm) -{ - unsigned long flags; - s64 ns; - - local_irq_save(flags); - ns = __get_kvmclock_ns(kvm); - local_irq_restore(flags); - - return ns; -} - static void kvm_setup_pvclock_page(struct kvm_vcpu *v) { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1834,9 +1806,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1850,16 +1822,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int 
kvm_guest_time_update(struct kvm_vcpu *v) @@ -2092,7 +2064,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2111,7 +2083,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2122,7 +2094,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2131,14 +2103,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2155,6 +2127,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: + case MSR_AMD64_DC_CFG: break; case MSR_EFER: @@ -2230,8 +2203,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) bool tmp = (msr == MSR_KVM_SYSTEM_TIME); if (ka->boot_vcpu_runs_old_kvmclock != tmp) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = tmp; } @@ -2243,7 +2215,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2264,7 +2236,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2331,6 +2303,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.osvw.status = data; break; + case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated || + data & ~MSR_PLATFORM_INFO_CPUID_FAULT || + (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && + cpuid_fault_enabled(vcpu))) + return 1; + vcpu->arch.msr_platform_info = data; + break; + case MSR_MISC_FEATURES_ENABLES: + if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || + (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + !supports_cpuid_fault(vcpu))) + return 1; + vcpu->arch.msr_misc_features_enables = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -2417,6 +2404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_FAM10H_MMIO_CONF_BASE: case 
MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: + case MSR_AMD64_DC_CFG: msr_info->data = 0; break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: @@ -2545,6 +2533,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.osvw.status; break; + case MSR_PLATFORM_INFO: + msr_info->data = vcpu->arch.msr_platform_info; + break; + case MSR_MISC_FEATURES_ENABLES: + msr_info->data = vcpu->arch.msr_misc_features_enables; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -2675,15 +2669,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif r = 1; break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; + case KVM_CAP_X86_GUEST_MWAIT: + r = kvm_mwait_in_guest(); + break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, @@ -2695,9 +2688,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; @@ -2713,11 +2703,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; -#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -2816,11 +2801,6 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } -static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ @@ -2864,7 +2844,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); + kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; } @@ -2878,7 +2858,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -3124,7 +3104,14 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; if (events->exception.injected && - (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || + is_guest_mode(vcpu))) + return -EINVAL; + + /* INITs are latched while in SMM */ + if (events->flags & KVM_VCPUEVENT_VALID_SMM && + (events->smi.smm || events->smi.pending) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) return -EINVAL; process_nmi(vcpu); @@ -3721,22 +3708,21 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { 
case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], + memcpy(&chip->chip.pic, &pic->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], + memcpy(&chip->chip.pic, &pic->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -3747,32 +3733,31 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; break; } - kvm_pic_update_irq(pic_irqchip(kvm)); + kvm_pic_update_irq(pic); return r; } @@ -4018,20 +4003,14 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { - mutex_lock(&kvm->slots_lock); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } r = kvm_setup_default_irq_routing(kvm); if (r) { - mutex_lock(&kvm->slots_lock); - mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->irq_lock); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ @@ -4196,10 +4175,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - local_irq_enable(); kvm_gen_update_masterclock(kvm); break; } @@ -4207,11 +4184,9 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); user_ns.clock = now_ns; user_ns.flags = kvm->arch.use_master_clock ? 
KVM_CLOCK_TSC_STABLE : 0; - local_irq_enable(); memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; @@ -4230,7 +4205,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + r = -ENOTTY; } out: return r; @@ -5223,6 +5198,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); } +static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +{ + return emul_to_vcpu(ctxt)->arch.hflags; +} + +static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +{ + kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5262,6 +5247,8 @@ static const struct x86_emulate_ops emulate_ops = { .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, + .get_hflags = emulator_get_hflags, + .set_hflags = emulator_set_hflags, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5314,7 +5301,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5718,8 +5704,6 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - if (vcpu->arch.hflags != ctxt->emul_flags) - kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); @@ -6869,7 +6853,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * 1) We should set ->mode before checking ->requests. Please see - * the comment in kvm_make_all_cpus_request. + * the comment in kvm_vcpu_exiting_guest_mode(). * * 2) For APICv, we should set ->mode before checking PIR.ON. 
This * pairs with the memory barrier implicit in pi_test_and_set_on @@ -7051,7 +7035,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -7179,7 +7163,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; goto out; } @@ -7355,6 +7339,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; + /* INITs are latched while in SMM */ + if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || + mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) + return -EINVAL; + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); @@ -7724,6 +7714,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; + + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.msr_misc_features_enables = 0; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); @@ -8068,7 +8061,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -8152,12 +8144,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); + kvm_pic_destroy(kvm); + kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); + kvm_page_track_cleanup(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -8198,13 +8190,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->base_gfn, level) + 1; slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL); if (!linfo) goto out_free; @@ -8384,7 +8376,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_SMI, vcpu)) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -8535,8 +8527,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, @@ -8566,11 +8559,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_ready(work->arch.token, work->gva); if (work->wakeup_all) work->arch.token = ~0; /* 
broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + trace_kvm_async_pf_ready(work->arch.token, work->gva); if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4ce38a..612067074905 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,6 +1,8 @@ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H +#include <asm/processor.h> +#include <asm/mwait.h> #include <linux/kvm_host.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" @@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) +static inline bool kvm_mwait_in_guest(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) + return false; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* All AMD CPUs have a working MWAIT implementation */ + return true; + case X86_VENDOR_INTEL: + /* Handle Intel below */ + break; + default: + return false; + } + + /* + * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as + * they would allow guest to stop the CPU completely by disabling + * interrupts then invoking MWAIT. + */ + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) + return false; + + return true; +} + #endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d3289d7e78fa..99472698c931 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -67,7 +67,7 @@ #include <asm/pgtable.h> #include <asm/desc.h> #include <asm/setup.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mce.h> #include <asm/io.h> #include <asm/fpu/api.h> @@ -994,7 +994,9 @@ static struct clock_event_device lguest_clockevent = { .mult = 1, .shift = 0, .min_delta_ns = LG_CLOCK_MIN_DELTA, + .min_delta_ticks = LG_CLOCK_MIN_DELTA, .max_delta_ns = LG_CLOCK_MAX_DELTA, + .max_delta_ticks = LG_CLOCK_MAX_DELTA, }; /* @@ -1178,9 +1180,9 @@ static __init char *lguest_memory_setup(void) * The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ - e820_add_region(boot_params.e820_map[0].addr, - boot_params.e820_map[0].size, - boot_params.e820_map[0].type); + e820__range_add(boot_params.e820_table[0].addr, + boot_params.e820_table[0].size, + boot_params.e820_table[0].type); /* This string is for the boot messages. */ return "LGUEST"; diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 5e2af3a88cf5..81b1635d67de 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -14,20 +14,15 @@ * Zero a page. 
* %rdi - page */ -ENTRY(clear_page) - - ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp clear_page_c_e", X86_FEATURE_ERMS - +ENTRY(clear_page_rep) movl $4096/8,%ecx xorl %eax,%eax rep stosq ret -ENDPROC(clear_page) -EXPORT_SYMBOL(clear_page) +ENDPROC(clear_page_rep) +EXPORT_SYMBOL_GPL(clear_page_rep) ENTRY(clear_page_orig) - xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -47,10 +42,12 @@ ENTRY(clear_page_orig) nop ret ENDPROC(clear_page_orig) +EXPORT_SYMBOL_GPL(clear_page_orig) -ENTRY(clear_page_c_e) +ENTRY(clear_page_erms) movl $4096,%ecx xorl %eax,%eax rep stosb ret -ENDPROC(clear_page_c_e) +ENDPROC(clear_page_erms) +EXPORT_SYMBOL_GPL(clear_page_erms) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index a8e91ae89fb3..29df077cb089 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops) { u64 start, end, delay, loops = __loops; + /* + * Timer value of 0 causes MWAITX to wait indefinitely, unless there + * is a store on the memory monitored by MONITORX. + */ + if (loops == 0) + return; + start = rdtsc_ordered(); for (;;) { diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index 121f59c6ee54..5761a4f19455 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -8,7 +8,7 @@ #include <asm/kaslr.h> #include <asm/msr.h> #include <asm/archrandom.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/io.h> /* diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 779782f58324..9a53a06e5a3e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -290,7 +290,7 @@ EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled) _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index c074799bddae..c8c6ad0d58b8 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -4,12 +4,9 @@ * For licencing details see kernel-base/COPYING */ -#include <linux/highmem.h> +#include <linux/uaccess.h> #include <linux/export.h> -#include <asm/word-at-a-time.h> -#include <linux/sched.h> - /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. @@ -34,52 +31,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return ret; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); - -/** - * copy_to_user: - Copy a block of data into user space. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ -unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -} -EXPORT_SYMBOL(_copy_to_user); - -/** - * copy_from_user: - Copy a block of data from user space. - * @to: Destination address, in kernel space. 
- * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. - */ -unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) -{ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else - memset(to, 0, n); - return n; -} -EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1f65ff6540f0..bd057a4ffe6e 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -5,12 +5,7 @@ * Copyright 1997 Andi Kleen <ak@muc.de> * Copyright 1997 Linus Torvalds */ -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/blkdev.h> #include <linux/export.h> -#include <linux/backing-dev.h> -#include <linux/interrupt.h> #include <linux/uaccess.h> #include <asm/mmx.h> #include <asm/asm.h> @@ -201,197 +196,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) return size; } -static unsigned long -__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) -{ - int d0, d1; - __asm__ __volatile__( - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" - " jbe 2f\n" - "1: movl 64(%4), %%eax\n" - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" - " movl %%eax, 0(%3)\n" - " movl %%edx, 4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" - " movl %%eax, 8(%3)\n" - " movl %%edx, 12(%3)\n" - "4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" - " movl %%eax, 16(%3)\n" - " movl %%edx, 20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" - " movl %%eax, 24(%3)\n" - " movl %%edx, 28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" - " movl %%eax, 32(%3)\n" - " movl %%edx, 36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" - " movl %%eax, 40(%3)\n" - " movl %%edx, 44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" - " movl %%eax, 48(%3)\n" - " movl %%edx, 52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" - " movl %%eax, 56(%3)\n" - " movl %%edx, 60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" - " cmpl $63, %0\n" - " ja 0b\n" - "5: movl %0, %%eax\n" - " shrl $2, %0\n" - " andl $3, %%eax\n" - " cld\n" - "6: rep; movsl\n" - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" - ".section .fixup,\"ax\"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" - " pushl %%eax\n" - " xorl %%eax,%%eax\n" - " rep; stosb\n" - " popl %%eax\n" - " popl %0\n" - " jmp 8b\n" - ".previous\n" - _ASM_EXTABLE(0b,16b) - _ASM_EXTABLE(1b,16b) - _ASM_EXTABLE(2b,16b) - _ASM_EXTABLE(21b,16b) - _ASM_EXTABLE(3b,16b) - _ASM_EXTABLE(31b,16b) - _ASM_EXTABLE(4b,16b) - _ASM_EXTABLE(41b,16b) - _ASM_EXTABLE(10b,16b) - _ASM_EXTABLE(51b,16b) - _ASM_EXTABLE(11b,16b) - _ASM_EXTABLE(61b,16b) - _ASM_EXTABLE(12b,16b) - _ASM_EXTABLE(71b,16b) - _ASM_EXTABLE(13b,16b) - _ASM_EXTABLE(81b,16b) - _ASM_EXTABLE(14b,16b) - _ASM_EXTABLE(91b,16b) - _ASM_EXTABLE(6b,9b) - _ASM_EXTABLE(7b,16b) - : "=&c"(size), "=&D" (d0), "=&S" (d1) - : "1"(to), "2"(from), "0"(size) - : "eax", "edx", "memory"); - return size; -} - -/* - * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware. 
- * hyoshiok@miraclelinux.com - */ - -static unsigned long __copy_user_zeroing_intel_nocache(void *to, - const void __user *from, unsigned long size) -{ - int d0, d1; - - __asm__ __volatile__( - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" - " jbe 2f\n" - "1: movl 64(%4), %%eax\n" - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" - " movnti %%eax, 0(%3)\n" - " movnti %%edx, 4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" - " movnti %%eax, 8(%3)\n" - " movnti %%edx, 12(%3)\n" - "4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" - " movnti %%eax, 16(%3)\n" - " movnti %%edx, 20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" - " movnti %%eax, 24(%3)\n" - " movnti %%edx, 28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" - " movnti %%eax, 32(%3)\n" - " movnti %%edx, 36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" - " movnti %%eax, 40(%3)\n" - " movnti %%edx, 44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" - " movnti %%eax, 48(%3)\n" - " movnti %%edx, 52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" - " movnti %%eax, 56(%3)\n" - " movnti %%edx, 60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" - " cmpl $63, %0\n" - " ja 0b\n" - " sfence \n" - "5: movl %0, %%eax\n" - " shrl $2, %0\n" - " andl $3, %%eax\n" - " cld\n" - "6: rep; movsl\n" - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" - ".section .fixup,\"ax\"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" - " pushl %%eax\n" - " xorl %%eax,%%eax\n" - " rep; stosb\n" - " popl %%eax\n" - " popl %0\n" - " jmp 8b\n" - ".previous\n" - _ASM_EXTABLE(0b,16b) - _ASM_EXTABLE(1b,16b) - _ASM_EXTABLE(2b,16b) - _ASM_EXTABLE(21b,16b) - _ASM_EXTABLE(3b,16b) - _ASM_EXTABLE(31b,16b) - _ASM_EXTABLE(4b,16b) - _ASM_EXTABLE(41b,16b) - _ASM_EXTABLE(10b,16b) - _ASM_EXTABLE(51b,16b) - _ASM_EXTABLE(11b,16b) - _ASM_EXTABLE(61b,16b) - _ASM_EXTABLE(12b,16b) - _ASM_EXTABLE(71b,16b) - _ASM_EXTABLE(13b,16b) - _ASM_EXTABLE(81b,16b) - _ASM_EXTABLE(14b,16b) - _ASM_EXTABLE(91b,16b) - _ASM_EXTABLE(6b,9b) - _ASM_EXTABLE(7b,16b) - : "=&c"(size), "=&D" (d0), "=&S" (d1) - : "1"(to), "2"(from), "0"(size) - : "eax", "edx", "memory"); - return size; -} - static unsigned long __copy_user_intel_nocache(void *to, const void __user *from, unsigned long size) { @@ -486,12 +290,8 @@ static unsigned long __copy_user_intel_nocache(void *to, * Leave these declared but undefined. They should not be any references to * them */ -unsigned long __copy_user_zeroing_intel(void *to, const void __user *from, - unsigned long size); unsigned long __copy_user_intel(void __user *to, const void *from, unsigned long size); -unsigned long __copy_user_zeroing_intel_nocache(void *to, - const void __user *from, unsigned long size); #endif /* CONFIG_X86_INTEL_USERCOPY */ /* Generic arbitrary sized copy. 
*/ @@ -528,47 +328,7 @@ do { \ : "memory"); \ } while (0) -#define __copy_user_zeroing(to, from, size) \ -do { \ - int __d0, __d1, __d2; \ - __asm__ __volatile__( \ - " cmp $7,%0\n" \ - " jbe 1f\n" \ - " movl %1,%0\n" \ - " negl %0\n" \ - " andl $7,%0\n" \ - " subl %0,%3\n" \ - "4: rep; movsb\n" \ - " movl %3,%0\n" \ - " shrl $2,%0\n" \ - " andl $3,%3\n" \ - " .align 2,0x90\n" \ - "0: rep; movsl\n" \ - " movl %3,%0\n" \ - "1: rep; movsb\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "5: addl %3,%0\n" \ - " jmp 6f\n" \ - "3: lea 0(%3,%0,4),%0\n" \ - "6: pushl %0\n" \ - " pushl %%eax\n" \ - " xorl %%eax,%%eax\n" \ - " rep; stosb\n" \ - " popl %%eax\n" \ - " popl %0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(4b,5b) \ - _ASM_EXTABLE(0b,3b) \ - _ASM_EXTABLE(1b,6b) \ - : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ - : "3"(size), "0"(size), "1"(to), "2"(from) \ - : "memory"); \ -} while (0) - -unsigned long __copy_to_user_ll(void __user *to, const void *from, - unsigned long n) +unsigned long __copy_user_ll(void *to, const void *from, unsigned long n) { stac(); if (movsl_is_ok(to, from, n)) @@ -578,51 +338,7 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from, clac(); return n; } -EXPORT_SYMBOL(__copy_to_user_ll); - -unsigned long __copy_from_user_ll(void *to, const void __user *from, - unsigned long n) -{ - stac(); - if (movsl_is_ok(to, from, n)) - __copy_user_zeroing(to, from, n); - else - n = __copy_user_zeroing_intel(to, from, n); - clac(); - return n; -} -EXPORT_SYMBOL(__copy_from_user_ll); - -unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, - unsigned long n) -{ - stac(); - if (movsl_is_ok(to, from, n)) - __copy_user(to, from, n); - else - n = __copy_user_intel((void __user *)to, - (const void *)from, n); - clac(); - return n; -} -EXPORT_SYMBOL(__copy_from_user_ll_nozero); - -unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, - unsigned long n) -{ - stac(); -#ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) - n = __copy_user_zeroing_intel_nocache(to, from, n); - else - __copy_user_zeroing(to, from, n); -#else - __copy_user_zeroing(to, from, n); -#endif - clac(); - return n; -} -EXPORT_SYMBOL(__copy_from_user_ll_nocache); +EXPORT_SYMBOL(__copy_user_ll); unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 69873589c0ba..3b7c40a2e3e1 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -54,15 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) -{ - if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { - return copy_user_generic((__force void *)to, (__force void *)from, len); - } - return len; -} -EXPORT_SYMBOL(copy_in_user); - /* * Try to copy last bytes and clear the rest if needed. 
* Since protection fault in copy_from/to_user is not a normal situation, @@ -80,9 +71,5 @@ copy_user_handle_tail(char *to, char *from, unsigned len) break; } clac(); - - /* If the destination is a kernel buffer, we always clear the end */ - if (!__addr_ok(to)) - memset(to, 0, len); return len; } diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index d1c7de095808..91f501b2da3b 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -19,7 +19,7 @@ #include <asm/types.h> #include <asm/mmzone.h> #include <asm/proto.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/pci-direct.h> #include <asm/numa.h> #include <asm/mpspec.h> diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 58b5bee7ea27..bce6990b1d81 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = { #define PTE_LEVEL_MULT (PAGE_SIZE) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT) #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ ({ \ @@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st, } } -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, - unsigned long P) +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) { int i; pte_t *start; pgprotval_t prot; - start = (pte_t *) pmd_page_vaddr(addr); + start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); @@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, #if PTRS_PER_PMD > 1 -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, - unsigned long P) +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; pmd_t *start; pgprotval_t prot; - start = (pmd_t *) pud_page_vaddr(addr); + start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { @@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); } -static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, - unsigned long P) +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; pud_t *start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *) pgd_page_vaddr(addr); + start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); @@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } #else -#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) +#define p4d_large(a) pud_large(__pud(p4d_val(a))) +#define p4d_none(a) pud_none(__pud(p4d_val(a))) +#endif + +#if PTRS_PER_P4D > 1 + +static void walk_p4d_level(struct 
seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) +{ + int i; + p4d_t *start; + pgprotval_t prot; + + start = (p4d_t *)pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_P4D; i++) { + st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); + if (!p4d_none(*start)) { + if (p4d_large(*start) || !p4d_present(*start)) { + prot = p4d_flags(*start); + note_page(m, st, __pgprot(prot), 2); + } else { + walk_pud_level(m, st, *start, + P + i * P4D_LEVEL_MULT); + } + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) +#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) +#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) #endif static inline bool is_hypervisor_range(int idx) @@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); } else { - walk_pud_level(m, &st, *start, + walk_p4d_level(m, &st, *start, i * PGD_LEVEL_MULT); } } else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e31763cb9..8ad91a01cbc8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would - * set_pud. + * set_p4d/set_pud. */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif - pmd = pmd_offset(pud_offset(pgd, address), address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* @@ -425,6 +435,7 @@ void vmalloc_sync_all(void) static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; + p4d_t *p4d, *p4d_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); - } else { + } else if (CONFIG_PGTABLE_LEVELS > 4) { + /* + * With folded p4d, pgd_none() is always false, so the pgd may + * point to an empty page table entry and pgd_page_vaddr() + * will return garbage. + * + * We will do the correct sanity check on the p4d level. + */ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } + /* With 4-level paging, copying happens on the p4d level. 
*/ + p4d = p4d_offset(pgd, address); + p4d_ref = p4d_offset(pgd_ref, address); + if (p4d_none(*p4d_ref)) + return -1; + + if (p4d_none(*p4d)) { + set_p4d(p4d, *p4d_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + } + /* * Below here mismatches are bugs because these lower tables * are shared: */ - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); + pud = pud_offset(p4d, address); + pud_ref = pud_offset(p4d_ref, address); if (pud_none(*pud_ref)) return -1; @@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + printk("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; @@ -1082,6 +1122,7 @@ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (!pgd_present(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 99c7805a9693..456dfdfd2249 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } /* - * 'pteval' can come from a pte, pmd or pud. We only check + * 'pteval' can come from a pte, pmd, pud or p4d. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. + * same value on all 4 types. */ static inline int pte_allows_gup(unsigned long pteval, int write) { @@ -106,32 +106,35 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; - int nr_start = *nr; - pte_t *ptep; + int nr_start = *nr, ret = 0; + pte_t *ptep, *ptem; - ptep = pte_offset_map(&pmd, addr); + /* + * Keep the original mapped PTE value (ptem) around since we + * might increment ptep off the end of the page when finishing + * our loop iteration. 
+ */ + ptem = ptep = pte_offset_map(&pmd, addr); do { pte_t pte = gup_get_pte(ptep); struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) { - pte_unmap(ptep); - return 0; - } + if (pte_protnone(pte)) + break; + + if (!pte_allows_gup(pte_val(pte), write)) + break; if (pte_devmap(pte)) { pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, pages); - pte_unmap(ptep); - return 0; + break; } - } else if (!pte_allows_gup(pte_val(pte), write) || - pte_special(pte)) { - pte_unmap(ptep); - return 0; - } + } else if (pte_special(pte)) + break; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); @@ -141,9 +144,11 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); - pte_unmap(ptep - 1); + if (addr == end) + ret = 1; + pte_unmap(ptem); - return 1; + return ret; } static inline void get_head_page_multiple(struct page *page, int nr) @@ -290,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, return 1; } -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&pgd, addr); + pudp = pud_offset(&p4d, addr); do { pud_t pud = *pudp; @@ -315,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = *p4dp; + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_large(p4d)); + if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. @@ -363,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); @@ -435,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index c5066a260803..302f43fd9c28 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -12,10 +12,12 @@ #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> +#include <linux/compat.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> +#include <asm/elf.h> #if 0 /* This is just for testing */ struct page * @@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; - info.low_limit = current->mm->mmap_legacy_base; - info.high_limit = TASK_SIZE; + info.low_limit = get_mmap_base(1); + info.high_limit = in_compat_syscall() ? 
+ tasksize_32bit() : tasksize_64bit(); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4f8b90..04210a29dd60 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, return 0; } +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + ident_pud_init(info, pud, addr, next); + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + ident_pud_init(info, pud, addr, next); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend) { @@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); - pud_t *pud; + p4d_t *p4d; next = (addr & PGDIR_MASK) + PGDIR_SIZE; if (next > end) next = end; if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; continue; } - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * The pgd entry has to point to the pud page table in this case. 
+ */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } } return 0; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 22af912d66d2..cbc87ea98751 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -5,8 +5,8 @@ #include <linux/memblock.h> #include <linux/bootmem.h> /* for max_low_pfn */ -#include <asm/cacheflush.h> -#include <asm/e820.h> +#include <asm/set_memory.h> +#include <asm/e820/api.h> #include <asm/init.h> #include <asm/page.h> #include <asm/page_types.h> @@ -373,14 +373,14 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -struct range pfn_mapped[E820_X_MAX]; +struct range pfn_mapped[E820_MAX_ENTRIES]; int nr_pfn_mapped; static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) { - nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES, nr_pfn_mapped, start_pfn, end_pfn); - nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES); max_pfn_mapped = max(max_pfn_mapped, end_pfn); @@ -430,7 +430,7 @@ unsigned long __ref init_memory_mapping(unsigned long start, /* * We need to iterate through the E820 memory map and create direct mappings - * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply + * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply * create direct mappings for all pfns from [0 to max_low_pfn) and * [4GB to max_pfn) because of possible memory holes in high addresses * that cannot be marked as UC by fixed/variable range MTRRs. @@ -643,21 +643,40 @@ void __init init_mem_mapping(void) * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains BIOS code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. + * On x86, access has to be given to the first megabyte of RAM because that + * area traditionally contains BIOS code and data regions used by X, dosemu, + * and similar apps. Since they map the entire memory range, the whole range + * must be allowed (for mapping), but any areas that would otherwise be + * disallowed are flagged as being "zero filled" instead of rejected. + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. */ int devmem_is_allowed(unsigned long pagenr) { - if (pagenr < 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + if (page_is_ram(pagenr)) { + /* + * For disallowed memory regions in the low 1MB range, + * request that the page be shown as all zeros. + */ + if (pagenr < 256) + return 2; + return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; + } + + /* + * This must follow RAM test, since System RAM is considered a + * restricted resource under CONFIG_STRICT_IOMEM. + */ + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { + /* Low 1MB bypasses iomem restrictions. 
*/ + if (pagenr < 256) + return 1; + + return 0; + } + + return 1; } void free_init_pages(char *what, unsigned long begin, unsigned long end) @@ -701,7 +720,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void __ref free_initmem(void) { - e820_reallocate_tables(); + e820__reallocate_tables(); free_init_pages("unused kernel", (unsigned long)(&__init_begin), @@ -724,6 +743,53 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) } #endif +/* + * Calculate the precise size of the DMA zone (first 16 MB of RAM), + * and pass it to the MM layer - to help it set zone watermarks more + * accurately. + * + * Done on 64-bit systems only for the time being, although 32-bit systems + * might benefit from this as well. + */ +void __init memblock_find_dma_reserve(void) +{ +#ifdef CONFIG_X86_64 + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start_addr, end_addr; + int i; + u64 u; + + /* + * Iterate over all memory ranges (free and reserved ones alike), + * to calculate the total number of pages in the first 16 MB of RAM: + */ + nr_pages = 0; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); + + nr_pages += end_pfn - start_pfn; + } + + /* + * Iterate over free memory ranges to calculate the number of free + * pages in the DMA zone, while not counting potential partial + * pages at the beginning or the end of the range: + */ + nr_free_pages = 0; + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN); + + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); +#endif +} + void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b4b53e6793f..99fb83819a5f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -38,7 +38,7 @@ #include <asm/pgtable.h> #include <asm/dma.h> #include <asm/fixmap.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/apic.h> #include <asm/bugs.h> #include <asm/tlb.h> @@ -48,7 +48,7 @@ #include <asm/sections.h> #include <asm/paravirt.h> #include <asm/setup.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/page_types.h> #include <asm/init.h> @@ -56,8 +56,6 @@ unsigned long highstart_pfn, highend_pfn; -static noinline int do_test_wp_bit(void); - bool __read_mostly __vmalloc_start_set = false; /* @@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false; */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; @@ -390,8 +391,11 @@ pte_t *kmap_pte; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - 
vaddr), vaddr), vaddr); + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); } static void __init kmap_init(void) @@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; @@ -450,6 +456,7 @@ void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -469,7 +476,8 @@ void __init native_pagetable_init(void) if (!pgd_present(*pgd)) break; - pud = pud_offset(pgd, va); + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); pmd = pmd_offset(pud, va); if (!pmd_present(*pmd)) break; @@ -716,20 +724,20 @@ void __init paging_init(void) */ static void __init test_wp_bit(void) { - printk(KERN_INFO - "Checking if this processor honours the WP bit even in supervisor mode..."); + char z = 0; + + printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode..."); - /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); - boot_cpu_data.wp_works_ok = do_test_wp_bit(); - clear_fixmap(FIX_WP_TEST); + __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); - if (!boot_cpu_data.wp_works_ok) { - printk(KERN_CONT "No.\n"); - panic("Linux doesn't support CPUs with broken WP."); - } else { + if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { + clear_fixmap(FIX_WP_TEST); printk(KERN_CONT "Ok.\n"); + return; } + + printk(KERN_CONT "No.\n"); + panic("Linux doesn't support CPUs with broken WP."); } void __init mem_init(void) @@ -811,8 +819,7 @@ void __init mem_init(void) BUG_ON(VMALLOC_START >= VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); - if (boot_cpu_data.wp_works_ok < 0) - test_wp_bit(); + test_wp_bit(); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -840,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size) #endif #endif -/* - * This function cannot be __init, since exceptions don't work in that - * section. Put this after the callers, so that it cannot be inlined. 
- */ -static noinline int do_test_wp_bit(void) -{ - char tmp_reg; - int flag; - - __asm__ __volatile__( - " movb %0, %1 \n" - "1: movb %1, %0 \n" - " xorl %2, %2 \n" - "2: \n" - _ASM_EXTABLE(1b,2b) - :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), - "=q" (tmp_reg), - "=r" (flag) - :"2" (1) - :"memory"); - - return flag; -} - int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 15173d37f399..41270b96403d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -41,7 +41,7 @@ #include <asm/pgalloc.h> #include <asm/dma.h> #include <asm/fixmap.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/apic.h> #include <asm/tlb.h> #include <asm/mmu_context.h> @@ -50,7 +50,7 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/numa.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/init.h> #include <asm/uv/uv.h> #include <asm/setup.h> @@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end) unsigned long address; for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); + pgd_t *pgd_ref = pgd_offset_k(address); + const p4d_t *p4d_ref; struct page *page; - if (pgd_none(*pgd_ref)) + /* + * With folded p4d, pgd_none() is always false, we need to + * handle synchronization on p4d level. + */ + BUILD_BUG_ON(pgd_none(*pgd_ref)); + p4d_ref = p4d_offset(pgd_ref, address); + + if (p4d_none(*p4d_ref)) continue; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(address); + p4d = p4d_offset(pgd, address); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); + if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) + BUG_ON(p4d_page_vaddr(*p4d) + != p4d_page_vaddr(*p4d_ref)); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); + if (p4d_none(*p4d)) + set_p4d(p4d, *p4d_ref); spin_unlock(pgt_lock); } @@ -149,16 +159,28 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { - pud_t *pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) + p4d_t *p4d = (p4d_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, p4d); + if (p4d != p4d_offset(pgd, 0)) printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); + p4d, p4d_offset(pgd, 0)); + } + return p4d_offset(pgd, vaddr); +} + +static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr) +{ + if (p4d_none(*p4d)) { + pud_t *pud = (pud_t *)spp_getpage(); + p4d_populate(&init_mm, p4d, pud); + if (pud != pud_offset(p4d, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pud, pud_offset(p4d, 0)); } - return pud_offset(pgd, vaddr); + return pud_offset(p4d, vaddr); } static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) @@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); if (pmd != pmd_offset(pud, 0)) - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + printk(KERN_ERR "PAGETABLE BUG #02! 
%p <-> %p\n", pmd, pmd_offset(pud, 0)); } return pmd_offset(pud, vaddr); @@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); if (pte != pte_offset_kernel(pmd, 0)) - printk(KERN_ERR "PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #03!\n"); } return pte_offset_kernel(pmd, vaddr); } -void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = pud_page + pud_index(vaddr); - pmd = fill_pmd(pud, vaddr); - pte = fill_pte(pmd, vaddr); + pmd_t *pmd = fill_pmd(pud, vaddr); + pte_t *pte = fill_pte(pmd, vaddr); set_pte(pte, new_pte); @@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) +{ + p4d_t *p4d = p4d_page + p4d_index(vaddr); + pud_t *pud = fill_pud(p4d, vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud = pud_page + pud_index(vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; - pud_t *pud_page; + p4d_t *p4d_page; pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); @@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } - pud_page = (pud_t*)pgd_page_vaddr(*pgd); - set_pte_vaddr_pud(pud_page, vaddr, pteval); + + p4d_page = p4d_offset(pgd, 0); + set_pte_vaddr_p4d(p4d_page, vaddr, pteval); } pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(vaddr); - pud = fill_pud(pgd, vaddr); + p4d = fill_p4d(pgd, vaddr); + pud = fill_pud(p4d, vaddr); return fill_pmd(pud, vaddr); } @@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, enum page_cache_mode cache) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgprot_t prot; @@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); if (pgd_none(*pgd)) { + p4d = (p4d_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE | + _PAGE_USER)); + } + p4d = p4d_offset(pgd, (unsigned long)__va(phys)); + if (p4d_none(*p4d)) { pud = (pud_t *) spp_getpage(); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE | _PAGE_USER)); } - pud = pud_offset(pgd, (unsigned long)__va(phys)); + pud = pud_offset(p4d, (unsigned long)__va(phys)); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | @@ -337,10 +379,10 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE; if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(paddr & PAGE_MASK, paddr_next, - E820_RAM) && - !e820_any_mapped(paddr & PAGE_MASK, paddr_next, - E820_RESERVED_KERN)) + !e820__mapped_any(paddr & PAGE_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PAGE_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) set_pte(pte, __pte(0)); continue; } @@ -392,10 +434,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long 
paddr, unsigned long paddr_end, paddr_next = (paddr & PMD_MASK) + PMD_SIZE; if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(paddr & PMD_MASK, paddr_next, - E820_RAM) && - !e820_any_mapped(paddr & PMD_MASK, paddr_next, - E820_RESERVED_KERN) + !e820__mapped_any(paddr & PMD_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PMD_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) set_pmd(pmd, __pmd(0)); continue; } @@ -478,10 +520,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (paddr >= paddr_end) { if (!after_bootmem && - !e820_any_mapped(paddr & PUD_MASK, paddr_next, - E820_RAM) && - !e820_any_mapped(paddr & PUD_MASK, paddr_next, - E820_RESERVED_KERN) + !e820__mapped_any(paddr & PUD_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PUD_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) set_pud(pud, __pud(0)); continue; } @@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start, for (; vaddr < vaddr_end; vaddr = vaddr_next) { pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d; pud_t *pud; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; - if (pgd_val(*pgd)) { - pud = (pud_t *)pgd_page_vaddr(*pgd); + BUILD_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vaddr); + if (p4d_val(*p4d)) { + pud = (pud_t *)p4d_page_vaddr(*p4d); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); @@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, pud); + p4d_populate(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) spin_unlock(&init_mm.page_table_lock); } +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + /* free a pud table */ + free_pagetable(p4d_page(*p4d), 0); + spin_lock(&init_mm.page_table_lock); + p4d_clear(p4d); + spin_unlock(&init_mm.page_table_lock); +} + static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, bool direct) @@ -899,7 +962,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, continue; } - pmd_base = (pmd_t *)pud_page_vaddr(*pud); + pmd_base = pmd_offset(pud, 0); remove_pmd_table(pmd_base, addr, next, direct); free_pmd_table(pmd_base, pud); } @@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, update_page_count(PG_LEVEL_1G, -pages); } +static void __meminit +remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pud_t *pud_base; + p4d_t *p4d; + + p4d = p4d_start + p4d_index(addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + BUILD_BUG_ON(p4d_large(*p4d)); + + pud_base = pud_offset(p4d, 0); + remove_pud_table(pud_base, addr, next, direct); + free_pud_table(pud_base, p4d); + } + + if (direct) + update_page_count(PG_LEVEL_512G, -pages); +} + /* start and end are both virtual address. 
*/ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct) @@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) unsigned long next; unsigned long addr; pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); @@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) if (!pgd_present(*pgd)) continue; - pud = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud, addr, next, direct); + p4d = p4d_offset(pgd, 0); + remove_p4d_table(p4d, addr, next, direct); } flush_tlb_all(); @@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr) if (pgd_none(*pgd)) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return 0; + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return 0; @@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long addr; unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (!pgd) return -ENOMEM; - pud = vmemmap_pud_populate(pgd, addr, node); + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); if (!pud) return -ENOMEM; @@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, unsigned long end = (unsigned long)(start_page + size); unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; unsigned int nr_pages; @@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7aaa2635862d..bbc558b88a88 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -9,12 +9,13 @@ #include <linux/bootmem.h> #include <linux/init.h> #include <linux/io.h> +#include <linux/ioport.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> -#include <asm/cacheflush.h> -#include <asm/e820.h> +#include <asm/set_memory.h> +#include <asm/e820/api.h> #include <asm/fixmap.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -425,7 +426,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) /* Don't assume we're using swapper_pg_dir at this point */ pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(addr)]; - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8d63d7a104c3..0c7d8129bed6 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt #include <linux/bootmem.h> #include <linux/kasan.h> @@ -7,11 +8,12 @@ #include <linux/sched/task.h> 
#include <linux/vmalloc.h> +#include <asm/e820/types.h> #include <asm/tlbflush.h> #include <asm/sections.h> extern pgd_t early_level4_pgt[PTRS_PER_PGD]; -extern struct range pfn_mapped[E820_X_MAX]; +extern struct range pfn_mapped[E820_MAX_ENTRIES]; static int __init map_range(struct range *range) { @@ -32,8 +34,19 @@ static int __init map_range(struct range *range) static void __init clear_pgds(unsigned long start, unsigned long end) { - for (; start < end; start += PGDIR_SIZE) - pgd_clear(pgd_offset_k(start)); + pgd_t *pgd; + + for (; start < end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() + * instead. + */ + if (CONFIG_PGTABLE_LEVELS < 5) + p4d_clear(p4d_offset(pgd, start)); + else + pgd_clear(pgd); + } } static void __init kasan_map_early_shadow(pgd_t *pgd) @@ -43,8 +56,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) unsigned long end = KASAN_SHADOW_END; for (i = pgd_index(start); start < end; i++) { - pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) - | _KERNPG_TABLE); + switch (CONFIG_PGTABLE_LEVELS) { + case 4: + pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | + _KERNPG_TABLE); + break; + case 5: + pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | + _KERNPG_TABLE); + break; + default: + BUILD_BUG(); + } start += PGDIR_SIZE; } } @@ -72,6 +95,7 @@ void __init kasan_early_init(void) pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; + p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; for (i = 0; i < PTRS_PER_PTE; i++) kasan_zero_pte[i] = __pte(pte_val); @@ -82,6 +106,9 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); + for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) + kasan_zero_p4d[i] = __p4d(p4d_val); + kasan_map_early_shadow(early_level4_pgt); kasan_map_early_shadow(init_level4_pgt); } @@ -103,7 +130,7 @@ void __init kasan_init(void) kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, kasan_mem_to_shadow((void *)PAGE_OFFSET)); - for (i = 0; i < E820_X_MAX; i++) { + for (i = 0; i < E820_MAX_ENTRIES; i++) { if (pfn_mapped[i].end == 0) break; diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 887e57182716..aed206475aa7 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -48,7 +48,7 @@ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; #if defined(CONFIG_X86_ESPFIX64) static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; #elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_START; +static const unsigned long vaddr_end = EFI_VA_END; #else static const unsigned long vaddr_end = __START_KERNEL_map; #endif @@ -105,7 +105,7 @@ void __init kernel_randomize_memory(void) */ BUILD_BUG_ON(vaddr_start >= vaddr_end); BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && - vaddr_end >= EFI_VA_START); + vaddr_end >= EFI_VA_END); BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 7940166c799b..19ad095b41df 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -30,30 +30,44 @@ #include <linux/limits.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> +#include <linux/compat.h> #include <asm/elf.h> struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned long stack_maxrandom_size(void) +unsigned 
long tasksize_32bit(void) +{ + return IA32_PAGE_OFFSET; +} + +unsigned long tasksize_64bit(void) +{ + return TASK_SIZE_MAX; +} + +static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; + max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max <<= PAGE_SHIFT; } return max; } -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole with possible stack randomization. - */ -#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) -#define MAX_GAP (TASK_SIZE/6*5) +#ifdef CONFIG_COMPAT +# define mmap32_rnd_bits mmap_rnd_compat_bits +# define mmap64_rnd_bits mmap_rnd_bits +#else +# define mmap32_rnd_bits mmap_rnd_bits +# define mmap64_rnd_bits mmap_rnd_bits +#endif + +#define SIZE_128M (128 * 1024 * 1024UL) static int mmap_is_legacy(void) { @@ -66,54 +80,91 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -unsigned long arch_mmap_rnd(void) +static unsigned long arch_rnd(unsigned int rndbits) { - unsigned long rnd; - - if (mmap_is_ia32()) -#ifdef CONFIG_COMPAT - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); -#else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); -#endif - else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; +} - return rnd << PAGE_SHIFT; +unsigned long arch_mmap_rnd(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap_min, gap_max; + + /* + * Top of mmap area (just below the process stack). + * Leave an at least ~128 MB hole with possible stack randomization. + */ + gap_min = SIZE_128M + stack_maxrandom_size(task_size); + gap_max = (task_size / 6) * 5; - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; + if (gap < gap_min) + gap = gap_min; + else if (gap > gap_max) + gap = gap_max; + + return PAGE_ALIGN(task_size - gap - rnd); +} - return PAGE_ALIGN(TASK_SIZE - gap - rnd); +static unsigned long mmap_legacy_base(unsigned long rnd, + unsigned long task_size) +{ + return __TASK_UNMAPPED_BASE(task_size) + rnd; } /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ +static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, + unsigned long random_factor, unsigned long task_size) +{ + *legacy_base = mmap_legacy_base(random_factor, task_size); + if (mmap_is_legacy()) + *base = *legacy_base; + else + *base = mmap_base(random_factor, task_size); +} + void arch_pick_mmap_layout(struct mm_struct *mm) { - unsigned long random_factor = 0UL; + if (mmap_is_legacy()) + mm->get_unmapped_area = arch_get_unmapped_area; + else + mm->get_unmapped_area = arch_get_unmapped_area_topdown; - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); + arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, + arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* + * The mmap syscall mapping base decision depends solely on the + * syscall type (64-bit or compat). 
This applies for 64bit + * applications and 32bit applications. The 64bit syscall uses + * mmap_base, the compat syscall uses mmap_compat_base. + */ + arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, + arch_rnd(mmap32_rnd_bits), tasksize_32bit()); +#endif +} - mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; +unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; - if (mmap_is_legacy()) { - mm->mmap_base = mm->mmap_legacy_base; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_compat_syscall()) { + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; } +#endif + return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index bef36622e408..4d434ddb75db 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -32,7 +32,7 @@ #include <linux/kallsyms.h> #include <asm/pgtable.h> #include <linux/mmiotrace.h> -#include <asm/e820.h> /* for ISA_START_ADDRESS */ +#include <asm/e820/api.h> /* for ISA_START_ADDRESS */ #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/cpu.h> diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 5126dfd52b18..1c34b767c84c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -526,15 +526,7 @@ int mpx_handle_bd_fault(void) if (!kernel_managing_mpx_tables(current->mm)) return -EINVAL; - if (do_mpx_bt_fault()) { - force_sig(SIGSEGV, current); - /* - * The force_sig() is essentially "handling" this - * exception, so we do not pass up the error - * from do_mpx_bt_fault(). - */ - } - return 0; + return do_mpx_bt_fault(); } /* @@ -590,7 +582,7 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, * we might run off the end of the bounds table if we are on * a 64-bit kernel and try to get 8 bytes. */ -int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, +static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, long __user *bd_entry_ptr) { u32 bd_entry_32; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 12dcad7297a5..25504d5aa816 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -12,7 +12,7 @@ #include <linux/sched.h> #include <linux/topology.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/amd_nb.h> @@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid) nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, MEMBLOCK_ALLOC_ACCESSIBLE); if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", + pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", nd_size, nid); return; } @@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid) * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up * - * Sanitize @mi by merging and removing unncessary memblks. Also check for + * Sanitize @mi by merging and removing unnecessary memblks. Also check for * conflicts and clear unused memblks. 
* * RETURNS: diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 28d42130243c..1dcd2be4cce4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -15,7 +15,7 @@ #include <linux/pci.h> #include <linux/vmalloc.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/sections.h> @@ -24,6 +24,7 @@ #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/pat.h> +#include <asm/set_memory.h> /* * The current flushing context - we pass it instead of 5 arguments: @@ -346,6 +347,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -354,7 +356,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return NULL; + + *level = PG_LEVEL_512G; + if (p4d_large(*p4d) || !p4d_present(*p4d)) + return (pte_t *)p4d; + + pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; @@ -406,13 +416,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) return NULL; @@ -477,11 +492,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } @@ -836,9 +853,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { - pud_t *pud = pud_offset(pgd, start); + pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? @@ -1004,8 +1021,8 @@ static long populate_pmd(struct cpa_data *cpa, return num_pages; } -static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, + pgprot_t pgprot) { pud_t *pud; unsigned long end; @@ -1026,7 +1043,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); /* * Need a PMD page? 
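Editorial aside, not part of the patch: the hunks in this series keep replacing the old two-step pgd/pud lookup with a full pgd -> p4d -> pud -> pmd -> pte walk. A minimal sketch of that pattern, assuming the standard <asm/pgtable.h> helpers; the helper name walk_to_pte() is hypothetical and huge mappings are simply treated as having no pte:

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Hypothetical helper: walk all five levels down to the pte for @address. */
static pte_t *walk_to_pte(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd = pgd_offset(mm, address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd))
		return NULL;

	/* Folded into the pgd when CONFIG_PGTABLE_LEVELS < 5. */
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return NULL;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || pud_large(*pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd) || pmd_large(*pmd))
		return NULL;

	return pte_offset_kernel(pmd, address);
}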
@@ -1047,7 +1064,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (cpa->numpages == cur_pages) return cur_pages; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* @@ -1067,7 +1084,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (start < end) { long tmp; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; @@ -1090,33 +1107,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ + p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); + if (pgd_none(*pgd_entry)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!p4d) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + /* * Allocate a PUD page and hand it down for mapping. */ - if (pgd_none(*pgd_entry)) { + p4d = p4d_offset(pgd_entry, addr); + if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); if (!pud) return -1; - set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); - ret = populate_pud(cpa, addr, pgd_entry, pgprot); + ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ - unmap_pud_range(pgd_entry, addr, + unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index efc32bc6862b..9b78685b66e6 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -10,6 +10,7 @@ #include <linux/seq_file.h> #include <linux/bootmem.h> #include <linux/debugfs.h> +#include <linux/ioport.h> #include <linux/kernel.h> #include <linux/pfn_t.h> #include <linux/slab.h> @@ -23,7 +24,7 @@ #include <asm/x86_init.h> #include <asm/pgtable.h> #include <asm/fcntl.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/page.h> #include <asm/msr.h> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff26bb96..508a708eb9a6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); } + +#if CONFIG_PGTABLE_LEVELS > 4 +void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) +{ + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(p4d)); +} +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ @@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) references from swapper_pg_dir. 
*/ if (CONFIG_PGTABLE_LEVELS == 2 || (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - CONFIG_PGTABLE_LEVELS == 4) { + CONFIG_PGTABLE_LEVELS >= 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); @@ -261,13 +269,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ return; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; @@ -580,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +#ifdef CONFIG_X86_5LEVEL +/** + * p4d_set_huge - setup kernel P4D mapping + * + * No 512GB pages yet -- always return 0 + */ +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} + +/** + * p4d_clear_huge - clear kernel P4D mapping when it is set + * + * No 512GB pages yet -- always return 0 + */ +int p4d_clear_huge(p4d_t *p4d) +{ + return 0; +} +#endif + /** * pud_set_huge - setup kernel PUD mapping * diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9adce776852b..b9bd5b8b14fa 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -12,7 +12,7 @@ #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) BUG(); return; } - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) { BUG(); return; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 35fe69529bc1..3ea20d61b523 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -18,7 +18,7 @@ #include <linux/mm.h> #include <asm/proto.h> #include <asm/numa.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/apic.h> #include <asm/uv/uv.h> diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 38868adf07ea..f6ae6830b341 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -9,7 +9,7 @@ #include <linux/mmiotrace.h> static unsigned long mmio_address; -module_param(mmio_address, ulong, 0); +module_param_hw(mmio_address, ulong, iomem, 0); MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " "(or 8 MB if read_far is non-zero)."); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a7655f6caf7d..6e7bedf69af7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, { struct flush_tlb_info info; - if (end == 0) - end = start + PAGE_SIZE; info.flush_mm = mm; info.flush_start = start; info.flush_end = end; @@ -289,23 +287,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, smp_call_function_many(cpumask, flush_tlb_func, &info, 1); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - - preempt_disable(); - - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - - /* This is an implicit full barrier that synchronizes with 
switch_mm. */ - local_flush_tlb(); - - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); - preempt_enable(); -} - /* * See Documentation/x86/tlb.txt for details. We choose 33 * because it is large enough to cover the vast majority (at @@ -326,6 +307,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); + + if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) + base_pages_to_flush = (end - start) >> PAGE_SHIFT; + if (base_pages_to_flush > tlb_single_page_flush_ceiling) + base_pages_to_flush = TLB_FLUSH_ALL; + if (current->active_mm != mm) { /* Synchronize with switch_mm. */ smp_mb(); @@ -342,15 +329,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, goto out; } - if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) - base_pages_to_flush = (end - start) >> PAGE_SHIFT; - /* * Both branches below are implicit full barriers (MOV to CR or * INVLPG) that synchronize with switch_mm. */ - if (base_pages_to_flush > tlb_single_page_flush_ceiling) { - base_pages_to_flush = TLB_FLUSH_ALL; + if (base_pages_to_flush == TLB_FLUSH_ALL) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { @@ -393,7 +376,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); + flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); preempt_enable(); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 32322ce9b405..f58939393eef 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <linux/bpf.h> int bpf_jit_enable __read_mostly; @@ -490,13 +491,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_LD | BPF_IMM | BPF_DW: - if (insn[1].code != 0 || insn[1].src_reg != 0 || - insn[1].dst_reg != 0 || insn[1].off != 0) { - /* verifier must catch invalid insns */ - pr_err("invalid BPF_LD_IMM64 insn\n"); - return -EINVAL; - } - /* optimization: if imm64 is zero, use 'xor <dst>,<dst>' * to save 7 bytes. 
*/ diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 0cb52ae0a8f0..190e718694b1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -735,6 +735,15 @@ void pcibios_disable_device (struct pci_dev *dev) pcibios_disable_irq(dev); } +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +void pcibios_release_device(struct pci_dev *dev) +{ + if (atomic_dec_return(&dev->enable_cnt) >= 0) + pcibios_disable_device(dev); + +} +#endif + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 0a9f2caf358f..7b4307163eac 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -34,7 +34,7 @@ #include <linux/bootmem.h> #include <asm/pat.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/pci_x86.h> #include <asm/io_apic.h> @@ -398,7 +398,7 @@ void __init pcibios_resource_survey(void) list_for_each_entry(bus, &pci_root_buses, node) pcibios_allocate_resources(bus, 1); - e820_reserve_resources_late(); + e820__reserve_resources_late(); /* * Insert the IO APIC resources after PCI initialization has * occurred to handle IO APICS that are mapped in on a BAR in @@ -406,50 +406,3 @@ void __init pcibios_resource_survey(void) */ ioapic_insert_resources(); } - -static const struct vm_operations_struct pci_mmap_ops = { - .access = generic_access_phys, -}; - -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine) -{ - unsigned long prot; - - /* I/O space cannot be accessed via normal processor loads and - * stores on this platform. - */ - if (mmap_state == pci_mmap_io) - return -EINVAL; - - prot = pgprot_val(vma->vm_page_prot); - - /* - * Return error if pat is not enabled and write_combine is requested. - * Caller can followup with UC MINUS request and add a WC mtrr if there - * is a free mtrr slot. - */ - if (!pat_enabled() && write_combine) - return -EINVAL; - - if (pat_enabled() && write_combine) - prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled() || boot_cpu_data.x86 > 3) - /* - * ioremap() and ioremap_nocache() defaults to UC MINUS for now. - * To avoid attribute conflicts, request UC MINUS here - * as well. 
- */ - prot |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); - - vma->vm_page_prot = __pgprot(prot); - - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - - vma->vm_ops = &pci_mmap_ops; - - return 0; -} diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index dd30b7e08bc2..d1b47d5bc9c3 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -18,7 +18,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/rculist.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/pci_x86.h> #include <asm/acpi.h> @@ -423,7 +423,7 @@ static acpi_status find_mboard_resource(acpi_handle handle, u32 lvl, return AE_OK; } -static int is_acpi_reserved(u64 start, u64 end, unsigned not_used) +static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used) { struct resource mcfg_res; @@ -440,11 +440,11 @@ static int is_acpi_reserved(u64 start, u64 end, unsigned not_used) return mcfg_res.flags; } -typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); +typedef bool (*check_reserved_t)(u64 start, u64 end, unsigned type); -static int __ref is_mmconf_reserved(check_reserved_t is_reserved, - struct pci_mmcfg_region *cfg, - struct device *dev, int with_e820) +static bool __ref is_mmconf_reserved(check_reserved_t is_reserved, + struct pci_mmcfg_region *cfg, + struct device *dev, int with_e820) { u64 addr = cfg->res.start; u64 size = resource_size(&cfg->res); @@ -452,7 +452,7 @@ static int __ref is_mmconf_reserved(check_reserved_t is_reserved, int num_buses; char *method = with_e820 ? "E820" : "ACPI motherboard resources"; - while (!is_reserved(addr, addr + size, E820_RESERVED)) { + while (!is_reserved(addr, addr + size, E820_TYPE_RESERVED)) { size >>= 1; if (size < (16UL<<20)) break; @@ -494,8 +494,8 @@ static int __ref is_mmconf_reserved(check_reserved_t is_reserved, return 1; } -static int __ref pci_mmcfg_check_reserved(struct device *dev, - struct pci_mmcfg_region *cfg, int early) +static bool __ref +pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int early) { if (!early && !acpi_disabled) { if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0)) @@ -514,7 +514,7 @@ static int __ref pci_mmcfg_check_reserved(struct device *dev, } /* - * e820_all_mapped() is marked as __init. + * e820__mapped_all() is marked as __init. * All entries from ACPI MCFG table have been checked at boot time. * For MCFG information constructed from hotpluggable host bridge's * _CBA method, just assume it's reserved. @@ -525,7 +525,7 @@ static int __ref pci_mmcfg_check_reserved(struct device *dev, /* Don't try to do this check unless configuration type 1 is available. 
how about type 2 ?*/ if (raw_pci_ops) - return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1); + return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1); return 0; } diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 43984bc1665a..3e9e166f6408 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -12,7 +12,7 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/rcupdate.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/pci_x86.h> /* Assume systems with more busses have correct MCFG */ diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index bea52496aea6..f1c1aa0430ae 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -10,7 +10,7 @@ #include <linux/acpi.h> #include <linux/bitmap.h> #include <linux/rcupdate.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/pci_x86.h> #define PREFIX "PCI: " diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 1d97cea3b3a4..c1bdb9edcae7 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -7,9 +7,11 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/uaccess.h> + #include <asm/pci_x86.h> +#include <asm/e820/types.h> #include <asm/pci-functions.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index e1fb269c87af..c4b3646bd04c 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -234,23 +234,14 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; for_each_pci_msi_entry(msidesc, dev) { - __pci_read_msi_msg(msidesc, &msg); - pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | - ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); - if (msg.data != XEN_PIRQ_MSI_DATA || - xen_irq_from_pirq(pirq) < 0) { - pirq = xen_allocate_pirq_msi(dev, msidesc); - if (pirq < 0) { - irq = -ENODEV; - goto error; - } - xen_msi_compose_msg(dev, pirq, &msg); - __pci_write_msi_msg(msidesc, &msg); - dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); - } else { - dev_dbg(&dev->dev, - "xen: msi already bound to pirq=%d\n", pirq); + pirq = xen_allocate_pirq_msi(dev, msidesc); + if (pirq < 0) { + irq = -ENODEV; + goto error; } + xen_msi_compose_msg(dev, pirq, &msg); + __pci_write_msi_msg(msidesc, &msg); + dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? 
@@ -456,7 +447,7 @@ void __init xen_msi_init(void) int __init pci_xen_hvm_init(void) { - if (!xen_feature(XENFEAT_hvm_pirqs)) + if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) return 0; #ifdef CONFIG_ACPI diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 066619b0700c..f1d83b34c329 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,6 +1,5 @@ OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o -obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c deleted file mode 100644 index 04ca8764f0c0..000000000000 --- a/arch/x86/platform/efi/efi-bgrt.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2012 Intel Corporation - * Author: Josh Triplett <josh@joshtriplett.org> - * - * Based on the bgrt driver: - * Copyright 2012 Red Hat, Inc <mjg@redhat.com> - * Author: Matthew Garrett - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/acpi.h> -#include <linux/efi.h> -#include <linux/efi-bgrt.h> - -struct acpi_table_bgrt bgrt_tab; -size_t __initdata bgrt_image_size; - -struct bmp_header { - u16 id; - u32 size; -} __packed; - -void __init efi_bgrt_init(struct acpi_table_header *table) -{ - void *image; - struct bmp_header bmp_header; - struct acpi_table_bgrt *bgrt = &bgrt_tab; - - if (acpi_disabled) - return; - - if (table->length < sizeof(bgrt_tab)) { - pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n", - table->length, sizeof(bgrt_tab)); - return; - } - *bgrt = *(struct acpi_table_bgrt *)table; - if (bgrt->version != 1) { - pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n", - bgrt->version); - goto out; - } - if (bgrt->status & 0xfe) { - pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n", - bgrt->status); - goto out; - } - if (bgrt->image_type != 0) { - pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n", - bgrt->image_type); - goto out; - } - if (!bgrt->image_address) { - pr_notice("Ignoring BGRT: null image address\n"); - goto out; - } - - image = early_memremap(bgrt->image_address, sizeof(bmp_header)); - if (!image) { - pr_notice("Ignoring BGRT: failed to map image header memory\n"); - goto out; - } - - memcpy(&bmp_header, image, sizeof(bmp_header)); - early_memunmap(image, sizeof(bmp_header)); - if (bmp_header.id != 0x4d42) { - pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", - bmp_header.id); - goto out; - } - bgrt_image_size = bmp_header.size; - efi_mem_reserve(bgrt->image_address, bgrt_image_size); - - return; -out: - memset(bgrt, 0, sizeof(bgrt_tab)); -} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 565dff3c9a12..7e76a4d8304b 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -47,8 +47,9 @@ #include <asm/setup.h> #include <asm/efi.h> +#include <asm/e820/api.h> #include <asm/time.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> #include <asm/uv/uv.h> @@ -139,21 +140,21 @@ static void __init do_add_efi_memmap(void) case 
EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: if (md->attribute & EFI_MEMORY_WB) - e820_type = E820_RAM; + e820_type = E820_TYPE_RAM; else - e820_type = E820_RESERVED; + e820_type = E820_TYPE_RESERVED; break; case EFI_ACPI_RECLAIM_MEMORY: - e820_type = E820_ACPI; + e820_type = E820_TYPE_ACPI; break; case EFI_ACPI_MEMORY_NVS: - e820_type = E820_NVS; + e820_type = E820_TYPE_NVS; break; case EFI_UNUSABLE_MEMORY: - e820_type = E820_UNUSABLE; + e820_type = E820_TYPE_UNUSABLE; break; case EFI_PERSISTENT_MEMORY: - e820_type = E820_PMEM; + e820_type = E820_TYPE_PMEM; break; default: /* @@ -161,12 +162,12 @@ static void __init do_add_efi_memmap(void) * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE */ - e820_type = E820_RESERVED; + e820_type = E820_TYPE_RESERVED; break; } - e820_add_region(start, size, e820_type); + e820__range_add(start, size, e820_type); } - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__update_table(e820_table); } int __init efi_memblock_x86_reserve_range(void) diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index cef39b097649..3481268da3d0 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void) load_cr3(initial_page_table); __flush_tlb_all(); - gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.address = get_cpu_gdt_paddr(0); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); @@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) { struct desc_ptr gdt_descr; - gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a4695da42d77..c488625c9712 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -35,7 +35,7 @@ #include <asm/setup.h> #include <asm/page.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/proto.h> @@ -47,7 +47,7 @@ #include <asm/pgalloc.h> /* - * We allocate runtime services regions bottom-up, starting from -4G, i.e. + * We allocate runtime services regions top-down, starting from -4G, i.e. * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. */ static u64 efi_va = EFI_VA_START; @@ -135,6 +135,7 @@ static pgd_t *efi_pgd; int __init efi_alloc_page_tables(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; gfp_t gfp_mask; @@ -147,15 +148,20 @@ int __init efi_alloc_page_tables(void) return -ENOMEM; pgd = efi_pgd + pgd_index(EFI_VA_END); + p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END); + if (!p4d) { + free_page((unsigned long)efi_pgd); + return -ENOMEM; + } - pud = pud_alloc_one(NULL, 0); + pud = pud_alloc(&init_mm, p4d, EFI_VA_END); if (!pud) { + if (CONFIG_PGTABLE_LEVELS > 4) + free_page((unsigned long) pgd_page_vaddr(*pgd)); free_page((unsigned long)efi_pgd); return -ENOMEM; } - pgd_populate(NULL, pgd, pud); - return 0; } @@ -166,6 +172,7 @@ void efi_sync_low_kernel_mappings(void) { unsigned num_entries; pgd_t *pgd_k, *pgd_efi; + p4d_t *p4d_k, *p4d_efi; pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) @@ -190,23 +197,37 @@ void efi_sync_low_kernel_mappings(void) memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); /* + * As with PGDs, we share all P4D entries apart from the one entry + * that covers the EFI runtime mapping space. 
+ */ + BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END)); + BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK)); + + pgd_efi = efi_pgd + pgd_index(EFI_VA_END); + pgd_k = pgd_offset_k(EFI_VA_END); + p4d_efi = p4d_offset(pgd_efi, 0); + p4d_k = p4d_offset(pgd_k, 0); + + num_entries = p4d_index(EFI_VA_END); + memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries); + + /* * We share all the PUD entries apart from those that map the * EFI regions. Copy around them. */ BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); - pgd_efi = efi_pgd + pgd_index(EFI_VA_END); - pud_efi = pud_offset(pgd_efi, 0); - - pgd_k = pgd_offset_k(EFI_VA_END); - pud_k = pud_offset(pgd_k, 0); + p4d_efi = p4d_offset(pgd_efi, EFI_VA_END); + p4d_k = p4d_offset(pgd_k, EFI_VA_END); + pud_efi = pud_offset(p4d_efi, 0); + pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); - pud_efi = pud_offset(pgd_efi, EFI_VA_START); - pud_k = pud_offset(pgd_k, EFI_VA_START); + pud_efi = pud_offset(p4d_efi, EFI_VA_START); + pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 30031d5293c4..26615991d69c 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -11,6 +11,8 @@ #include <linux/bootmem.h> #include <linux/acpi.h> #include <linux/dmi.h> + +#include <asm/e820/api.h> #include <asm/efi.h> #include <asm/uv/uv.h> @@ -201,6 +203,10 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } + /* No need to reserve regions that will never be freed. */ + if (md.attribute & EFI_MEMORY_RUNTIME) + return; + size += addr % EFI_PAGE_SIZE; size = round_up(size, EFI_PAGE_SIZE); addr = round_down(addr, EFI_PAGE_SIZE); @@ -240,14 +246,14 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) * else. We must only reserve (and then free) regions: * * - Not within any part of the kernel - * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc) + * - Not the BIOS reserved area (E820_TYPE_RESERVED, E820_TYPE_NVS, etc) */ static bool can_free_region(u64 start, u64 size) { if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end)) return false; - if (!e820_all_mapped(start, start+size, E820_RAM)) + if (!e820__mapped_all(start, start+size, E820_TYPE_RAM)) return false; return true; @@ -280,7 +286,7 @@ void __init efi_reserve_boot_services(void) * A good example of a critical region that must not be * freed is page zero (first 4Kb of memory), which may * contain boot services code/data but is marked - * E820_RESERVED by trim_bios_range(). + * E820_TYPE_RESERVED by trim_bios_range(). 
*/ if (!already_reserved) { memblock_reserve(start, size); diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index a7dbec4dce27..53e0235e308f 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -2,8 +2,9 @@ obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o # SDHCI Devices obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o -# WiFi +# WiFi + BT obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o +obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o # IPC Devices obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o @@ -26,5 +27,6 @@ obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o +obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_mrfld_power_btn.o obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c new file mode 100644 index 000000000000..5a0483e7bf66 --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -0,0 +1,108 @@ +/* + * Bluetooth platform data initialization file + * + * (C) Copyright 2017 Intel Corporation + * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include <linux/gpio/machine.h> +#include <linux/pci.h> +#include <linux/platform_device.h> + +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> +#include <asm/intel-mid.h> + +struct bt_sfi_data { + struct device *dev; + const char *name; + int (*setup)(struct bt_sfi_data *ddata); +}; + +static struct gpiod_lookup_table tng_bt_sfi_gpio_table = { + .dev_id = "hci_bcm", + .table = { + GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup", GPIO_ACTIVE_HIGH), + { }, + }, +}; + +#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP "bt_wakeup" +#define TNG_BT_SFI_GPIO_SHUTDOWN "BT-reset" +#define TNG_BT_SFI_GPIO_HOST_WAKEUP "bt_uart_enable" + +static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) +{ + struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table; + struct gpiod_lookup *lookup = table->table; + struct pci_dev *pdev; + + /* Connected to /dev/ttyS0 */ + pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1)); + if (!pdev) + return -ENODEV; + + ddata->dev = &pdev->dev; + ddata->name = table->dev_id; + + lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP); + lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN); + lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP); + + gpiod_add_lookup_table(table); + return 0; +} + +static struct bt_sfi_data tng_bt_sfi_data __initdata = { + .setup = tng_bt_sfi_setup, +}; + +#define ICPU(model, ddata) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata } + +static const struct x86_cpu_id bt_sfi_cpu_ids[] = { + ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data), + {} +}; + +static int __init bt_sfi_init(void) +{ + struct platform_device_info info; + struct platform_device *pdev; + const struct x86_cpu_id *id; + struct bt_sfi_data *ddata; + int ret; + + id = x86_match_cpu(bt_sfi_cpu_ids); + if (!id) + return -ENODEV; + + ddata = (struct bt_sfi_data *)id->driver_data; + if (!ddata) + return -ENODEV; + + ret = ddata->setup(ddata); + if (ret) + return ret; + + memset(&info, 0, sizeof(info)); + info.fwnode = ddata->dev->fwnode; + info.parent = ddata->dev; + info.name = ddata->name, + info.id = PLATFORM_DEVID_NONE, + + pdev = platform_device_register_full(&info); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name); + return 0; +} +device_initcall(bt_sfi_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c new file mode 100644 index 000000000000..a6c3705a28ad --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c @@ -0,0 +1,82 @@ +/* + * Intel Merrifield power button support + * + * (C) Copyright 2017 Intel Corporation + * + * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/platform_device.h> +#include <linux/sfi.h> + +#include <asm/intel-mid.h> +#include <asm/intel_scu_ipc.h> + +static struct resource mrfld_power_btn_resources[] = { + { + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mrfld_power_btn_dev = { + .name = "msic_power_btn", + .id = PLATFORM_DEVID_NONE, + .num_resources = ARRAY_SIZE(mrfld_power_btn_resources), + .resource = mrfld_power_btn_resources, +}; + +static int mrfld_power_btn_scu_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + if (code == SCU_DOWN) { + platform_device_unregister(&mrfld_power_btn_dev); + return 0; + } + + return platform_device_register(&mrfld_power_btn_dev); +} + +static struct notifier_block mrfld_power_btn_scu_notifier = { + .notifier_call = mrfld_power_btn_scu_status_change, +}; + +static int __init register_mrfld_power_btn(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + /* + * We need to be sure that the SCU IPC is ready before + * PMIC power button device can be registered: + */ + intel_scu_notifier_add(&mrfld_power_btn_scu_notifier); + + return 0; +} +arch_initcall(register_mrfld_power_btn); + +static void __init *mrfld_power_btn_platform_data(void *info) +{ + struct resource *res = mrfld_power_btn_resources; + struct sfi_device_table_entry *pentry = info; + + res->start = res->end = pentry->irq; + return NULL; +} + +static const struct devs_id mrfld_power_btn_dev_id __initconst = { + .name = "bcove_power_btn", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .msic = 1, + .get_platform_data = &mrfld_power_btn_platform_data, +}; + +sfi_device(mrfld_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 86edd1e941eb..9e304e2ea4f5 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -19,7 +19,7 @@ #include <asm/intel_scu_ipc.h> #include <asm/io_apic.h> -#define TANGIER_EXT_TIMER0_MSI 15 +#define TANGIER_EXT_TIMER0_MSI 12 static struct platform_device wdt_dev = { .name = "intel_mid_wdt", diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index e793fe509971..e42978d4deaf 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -17,16 +17,6 @@ #include "intel_mid_weak_decls.h" -static void penwell_arch_setup(void); -/* penwell arch ops */ -static struct intel_mid_ops penwell_ops = { - .arch_setup = penwell_arch_setup, -}; - -static void mfld_power_off(void) -{ -} - static unsigned long __init mfld_calibrate_tsc(void) { unsigned long fast_calibrate; @@ -63,9 +53,12 @@ static unsigned long __init mfld_calibrate_tsc(void) static void __init penwell_arch_setup(void) { x86_platform.calibrate_tsc = mfld_calibrate_tsc; - pm_power_off = mfld_power_off; } +static struct intel_mid_ops penwell_ops = { + .arch_setup = penwell_arch_setup, +}; + void *get_penwell_ops(void) { return &penwell_ops; diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index edf2c54bf131..a952ac199741 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -34,6 +34,8 @@ static struct pci_dev *mbi_pdev; static DEFINE_SPINLOCK(iosf_mbi_lock); +static DEFINE_MUTEX(iosf_mbi_punit_mutex); +static BLOCKING_NOTIFIER_HEAD(iosf_mbi_pmic_bus_access_notifier); static inline 
u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) { @@ -190,6 +192,53 @@ bool iosf_mbi_available(void) } EXPORT_SYMBOL(iosf_mbi_available); +void iosf_mbi_punit_acquire(void) +{ + mutex_lock(&iosf_mbi_punit_mutex); +} +EXPORT_SYMBOL(iosf_mbi_punit_acquire); + +void iosf_mbi_punit_release(void) +{ + mutex_unlock(&iosf_mbi_punit_mutex); +} +EXPORT_SYMBOL(iosf_mbi_punit_release); + +int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb) +{ + int ret; + + /* Wait for the bus to go inactive before registering */ + mutex_lock(&iosf_mbi_punit_mutex); + ret = blocking_notifier_chain_register( + &iosf_mbi_pmic_bus_access_notifier, nb); + mutex_unlock(&iosf_mbi_punit_mutex); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_register_pmic_bus_access_notifier); + +int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb) +{ + int ret; + + /* Wait for the bus to go inactive before unregistering */ + mutex_lock(&iosf_mbi_punit_mutex); + ret = blocking_notifier_chain_unregister( + &iosf_mbi_pmic_bus_access_notifier, nb); + mutex_unlock(&iosf_mbi_punit_mutex); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier); + +int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v) +{ + return blocking_notifier_call_chain( + &iosf_mbi_pmic_bus_access_notifier, val, v); +} +EXPORT_SYMBOL(iosf_mbi_call_pmic_bus_access_notifier_chain); + #ifdef CONFIG_IOSF_MBI_DEBUG static u32 dbg_mdr; static u32 dbg_mcr; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 766d4d3529a1..42e65fee5673 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -23,28 +23,7 @@ #include <asm/irq_vectors.h> #include <asm/timer.h> -static struct bau_operations ops; - -static struct bau_operations uv123_bau_ops = { - .bau_gpa_to_offset = uv_gpa_to_offset, - .read_l_sw_ack = read_mmr_sw_ack, - .read_g_sw_ack = read_gmmr_sw_ack, - .write_l_sw_ack = write_mmr_sw_ack, - .write_g_sw_ack = write_gmmr_sw_ack, - .write_payload_first = write_mmr_payload_first, - .write_payload_last = write_mmr_payload_last, -}; - -static struct bau_operations uv4_bau_ops = { - .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram, - .read_l_sw_ack = read_mmr_proc_sw_ack, - .read_g_sw_ack = read_gmmr_proc_sw_ack, - .write_l_sw_ack = write_mmr_proc_sw_ack, - .write_g_sw_ack = write_gmmr_proc_sw_ack, - .write_payload_first = write_mmr_proc_payload_first, - .write_payload_last = write_mmr_proc_payload_last, -}; - +static struct bau_operations ops __ro_after_init; /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ static int timeout_base_ns[] = { @@ -548,11 +527,12 @@ static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift) * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP */ static int uv1_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { unsigned long descriptor_status; cycles_t ttm; + u64 mmr_offset = bcp->status_mmr; + int right_shift = bcp->status_index; struct ptc_stats *stat = bcp->statp; descriptor_status = uv1_read_status(mmr_offset, right_shift); @@ -640,11 +620,12 @@ int handle_uv2_busy(struct bau_control *bcp) } static int uv2_3_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { unsigned long descriptor_stat; cycles_t ttm; + u64 mmr_offset = bcp->status_mmr; + int right_shift = bcp->status_index; int desc = bcp->uvhub_cpu; long busy_reps = 0; 
struct ptc_stats *stat = bcp->statp; @@ -706,28 +687,59 @@ static int uv2_3_wait_completion(struct bau_desc *bau_desc, } /* - * There are 2 status registers; each and array[32] of 2 bits. Set up for - * which register to read and position in that register based on cpu in - * current hub. + * Returns the status of current BAU message for cpu desc as a bit field + * [Error][Busy][Aux] */ -static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try) +static u64 read_status(u64 status_mmr, int index, int desc) { - int right_shift; - unsigned long mmr_offset; + u64 stat; + + stat = ((read_lmmr(status_mmr) >> index) & UV_ACT_STATUS_MASK) << 1; + stat |= (read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_2) >> desc) & 0x1; + + return stat; +} + +static int uv4_wait_completion(struct bau_desc *bau_desc, + struct bau_control *bcp, long try) +{ + struct ptc_stats *stat = bcp->statp; + u64 descriptor_stat; + u64 mmr = bcp->status_mmr; + int index = bcp->status_index; int desc = bcp->uvhub_cpu; - if (desc < UV_CPUS_PER_AS) { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = desc * UV_ACT_STATUS_SIZE; - } else { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); - } + descriptor_stat = read_status(mmr, index, desc); - if (bcp->uvhub_version == 1) - return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); - else - return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); + /* spin on the status MMR, waiting for it to go idle */ + while (descriptor_stat != UV2H_DESC_IDLE) { + switch (descriptor_stat) { + case UV2H_DESC_SOURCE_TIMEOUT: + stat->s_stimeout++; + return FLUSH_GIVEUP; + + case UV2H_DESC_DEST_TIMEOUT: + stat->s_dtimeout++; + bcp->conseccompletes = 0; + return FLUSH_RETRY_TIMEOUT; + + case UV2H_DESC_DEST_STRONG_NACK: + stat->s_plugged++; + bcp->conseccompletes = 0; + return FLUSH_RETRY_PLUGGED; + + case UV2H_DESC_DEST_PUT_ERR: + bcp->conseccompletes = 0; + return FLUSH_GIVEUP; + + default: + /* descriptor_stat is still BUSY */ + cpu_relax(); + } + descriptor_stat = read_status(mmr, index, desc); + } + bcp->conseccompletes++; + return FLUSH_COMPLETE; } /* @@ -918,7 +930,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_3_bau_msg_header *uv2_3_hdr = NULL; - if (bcp->uvhub_version == 1) { + if (bcp->uvhub_version == UV_BAU_V1) { uv1 = 1; uv1_throttle(hmaster, stat); } @@ -958,7 +970,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, write_mmr_activation(index); try++; - completion_stat = wait_completion(bau_desc, bcp, try); + completion_stat = ops.wait_completion(bau_desc, bcp, try); handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); @@ -1114,15 +1126,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, unsigned long end, unsigned int cpu) { - int locals = 0; - int remotes = 0; - int hubs = 0; + int locals = 0, remotes = 0, hubs = 0; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; - unsigned long descriptor_status; - unsigned long status; + unsigned long descriptor_status, status, address; bcp = &per_cpu(bau_control, cpu); @@ -1171,10 +1180,24 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, record_send_statistics(stat, locals, hubs, remotes, bau_desc); if (!end || (end - start) <= PAGE_SIZE) - bau_desc->payload.address = start; + 
address = start; else - bau_desc->payload.address = TLB_FLUSH_ALL; - bau_desc->payload.sending_cpu = cpu; + address = TLB_FLUSH_ALL; + + switch (bcp->uvhub_version) { + case UV_BAU_V1: + case UV_BAU_V2: + case UV_BAU_V3: + bau_desc->payload.uv1_2_3.address = address; + bau_desc->payload.uv1_2_3.sending_cpu = cpu; + break; + case UV_BAU_V4: + bau_desc->payload.uv4.address = address; + bau_desc->payload.uv4.sending_cpu = cpu; + bau_desc->payload.uv4.qualifier = BAU_DESC_QUALIFIER; + break; + } + /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. @@ -1296,7 +1319,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) msgdesc.msg_slot = msg - msgdesc.queue_first; msgdesc.msg = msg; - if (bcp->uvhub_version == 2) + if (bcp->uvhub_version == UV_BAU_V2) process_uv2_message(&msgdesc, bcp); else /* no error workaround for uv1 or uv3 */ @@ -1838,7 +1861,7 @@ static void pq_init(int node, int pnode) * and the payload queue tail must be maintained by the kernel. */ bcp = &per_cpu(bau_control, smp_processor_id()); - if (bcp->uvhub_version <= 3) { + if (bcp->uvhub_version <= UV_BAU_V3) { tail = first; gnode = uv_gpa_to_gnode(uv_gpa(pqp)); first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail; @@ -1847,7 +1870,6 @@ static void pq_init(int node, int pnode) ops.write_payload_first(pnode, first); ops.write_payload_last(pnode, last); - ops.write_g_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); @@ -2035,8 +2057,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, struct bau_control **smasterp, struct bau_control **hmasterp) { - int i; - int cpu; + int i, cpu, uvhub_cpu; struct bau_control *bcp; for (i = 0; i < sdp->num_cpus; i++) { @@ -2053,19 +2074,33 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->socket_master = *smasterp; bcp->uvhub = bdp->uvhub; if (is_uv1_hub()) - bcp->uvhub_version = 1; + bcp->uvhub_version = UV_BAU_V1; else if (is_uv2_hub()) - bcp->uvhub_version = 2; + bcp->uvhub_version = UV_BAU_V2; else if (is_uv3_hub()) - bcp->uvhub_version = 3; + bcp->uvhub_version = UV_BAU_V3; else if (is_uv4_hub()) - bcp->uvhub_version = 4; + bcp->uvhub_version = UV_BAU_V4; else { pr_emerg("uvhub version not 1, 2, 3, or 4\n"); return 1; } bcp->uvhub_master = *hmasterp; - bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu); + uvhub_cpu = uv_cpu_blade_processor_id(cpu); + bcp->uvhub_cpu = uvhub_cpu; + + /* + * The ERROR and BUSY status registers are located pairwise over + * the STATUS_0 and STATUS_1 mmrs; each an array[32] of 2 bits. 
+ */ + if (uvhub_cpu < UV_CPUS_PER_AS) { + bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + bcp->status_index = uvhub_cpu * UV_ACT_STATUS_SIZE; + } else { + bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + bcp->status_index = (uvhub_cpu - UV_CPUS_PER_AS) + * UV_ACT_STATUS_SIZE; + } if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { pr_emerg("%d cpus per uvhub invalid\n", @@ -2148,6 +2183,39 @@ fail: return 1; } +static const struct bau_operations uv1_bau_ops __initconst = { + .bau_gpa_to_offset = uv_gpa_to_offset, + .read_l_sw_ack = read_mmr_sw_ack, + .read_g_sw_ack = read_gmmr_sw_ack, + .write_l_sw_ack = write_mmr_sw_ack, + .write_g_sw_ack = write_gmmr_sw_ack, + .write_payload_first = write_mmr_payload_first, + .write_payload_last = write_mmr_payload_last, + .wait_completion = uv1_wait_completion, +}; + +static const struct bau_operations uv2_3_bau_ops __initconst = { + .bau_gpa_to_offset = uv_gpa_to_offset, + .read_l_sw_ack = read_mmr_sw_ack, + .read_g_sw_ack = read_gmmr_sw_ack, + .write_l_sw_ack = write_mmr_sw_ack, + .write_g_sw_ack = write_gmmr_sw_ack, + .write_payload_first = write_mmr_payload_first, + .write_payload_last = write_mmr_payload_last, + .wait_completion = uv2_3_wait_completion, +}; + +static const struct bau_operations uv4_bau_ops __initconst = { + .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram, + .read_l_sw_ack = read_mmr_proc_sw_ack, + .read_g_sw_ack = read_gmmr_proc_sw_ack, + .write_l_sw_ack = write_mmr_proc_sw_ack, + .write_g_sw_ack = write_gmmr_proc_sw_ack, + .write_payload_first = write_mmr_proc_payload_first, + .write_payload_last = write_mmr_proc_payload_last, + .wait_completion = uv4_wait_completion, +}; + /* * Initialization of BAU-related structures */ @@ -2167,11 +2235,11 @@ static int __init uv_bau_init(void) if (is_uv4_hub()) ops = uv4_bau_ops; else if (is_uv3_hub()) - ops = uv123_bau_ops; + ops = uv2_3_bau_ops; else if (is_uv2_hub()) - ops = uv123_bau_ops; + ops = uv2_3_bau_ops; else if (is_uv1_hub()) - ops = uv123_bau_ops; + ops = uv1_bau_ops; for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 2ee7632d4916..b082d71b08ee 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -390,9 +390,11 @@ static __init int uv_rtc_setup_clock(void) clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / sn_rtc_cycles_per_second; + clock_event_device_uv.min_delta_ticks = 1; clock_event_device_uv.max_delta_ns = clocksource_uv.mask * (NSEC_PER_SEC / sn_rtc_cycles_per_second); + clock_event_device_uv.max_delta_ticks = clocksource_uv.mask; rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 66ade16c7693..6b05a9219ea2 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt) * 'pmode_gdt' in wakeup_start. 
*/ ctxt->gdt_desc.size = GDT_SIZE - 1; - ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); + ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id()); store_tr(ctxt->tr); @@ -162,7 +162,7 @@ static void fix_processor_context(void) int cpu = smp_processor_id(); struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 - struct desc_struct *desc = get_cpu_gdt_table(cpu); + struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif set_tss_desc(cpu, t); /* @@ -183,6 +183,9 @@ static void fix_processor_context(void) load_mm_ldt(current->active_mm); /* This does lldt */ fpu__resume_cpu(); + + /* The processor is back on the direct GDT, load back the fixmap */ + load_fixmap_gdt(cpu); } /** diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd34581d..c35fdb585c68 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -32,6 +32,7 @@ pgd_t *resume_pg_dir; */ static pmd_t *resume_one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd) return NULL; set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); #else - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); #endif diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index ded2e8272382..6a61194ffd58 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -16,6 +16,7 @@ #include <crypto/hash.h> +#include <asm/e820/api.h> #include <asm/init.h> #include <asm/proto.h> #include <asm/page.h> @@ -49,6 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; + p4d_t *p4d; /* * The new mapping only has to cover the page containing the image @@ -63,6 +65,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * the virtual address space after switching over to the original page * tables used by the image kernel. 
*/ + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); + if (!p4d) + return -ENOMEM; + } + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; @@ -75,8 +84,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), - __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* No p4d for 4-level paging: point the pgd to the pud page table */ + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); + } return 0; } @@ -124,7 +138,10 @@ static int set_up_temporary_mappings(void) static int relocate_restore_code(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; + pmd_t *pmd; + pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) @@ -134,22 +151,25 @@ static int relocate_restore_code(void) /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); - pud = pud_offset(pgd, relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); if (pud_large(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - } else { - pmd_t *pmd = pmd_offset(pud, relocated_restore_code); - - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - } else { - pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); - } + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: __flush_tlb_all(); - return 0; } @@ -195,12 +215,12 @@ struct restore_data_record { #if IS_BUILTIN(CONFIG_CRYPTO_MD5) /** - * get_e820_md5 - calculate md5 according to given e820 map + * get_e820_md5 - calculate md5 according to given e820 table * - * @map: the e820 map to be calculated + * @table: the e820 table to be calculated * @buf: the md5 result to be stored to */ -static int get_e820_md5(struct e820map *map, void *buf) +static int get_e820_md5(struct e820_table *table, void *buf) { struct scatterlist sg; struct crypto_ahash *tfm; @@ -213,10 +233,9 @@ static int get_e820_md5(struct e820map *map, void *buf) { AHASH_REQUEST_ON_STACK(req, tfm); - size = offsetof(struct e820map, map) - + sizeof(struct e820entry) * map->nr_map; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; ahash_request_set_tfm(req, tfm); - sg_init_one(&sg, (u8 *)map, size); + sg_init_one(&sg, (u8 *)table, size); ahash_request_set_callback(req, 0, NULL, NULL); ahash_request_set_crypt(req, &sg, buf, size); @@ -231,7 +250,7 @@ static int get_e820_md5(struct e820map *map, void *buf) static void hibernation_e820_save(void *buf) { - get_e820_md5(e820_saved, buf); + get_e820_md5(e820_table_firmware, buf); } static bool hibernation_e820_mismatch(void *buf) @@ -244,7 +263,7 @@ static bool hibernation_e820_mismatch(void *buf) if 
(!memcmp(result, buf, MD5_DIGEST_SIZE)) return false; - ret = get_e820_md5(e820_saved, result); + ret = get_e820_md5(e820_table_firmware, result); if (ret) return true; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 555b9fa0ad43..7dbdb780264d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KASAN_SANITIZE := n KCOV_INSTRUMENT := n # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 25e068ba3382..470edad96bb9 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -10,21 +10,19 @@ * Version 2. See the file COPYING for more details. */ +#include <linux/bug.h> +#include <asm/purgatory.h> + #include "sha256.h" #include "../boot/string.h" -struct sha_region { - unsigned long start; - unsigned long len; -}; - -unsigned long backup_dest = 0; -unsigned long backup_src = 0; -unsigned long backup_sz = 0; +unsigned long purgatory_backup_dest __section(.kexec-purgatory); +unsigned long purgatory_backup_src __section(.kexec-purgatory); +unsigned long purgatory_backup_sz __section(.kexec-purgatory); -u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; +u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory); -struct sha_region sha_regions[16] = {}; +struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory); /* * On x86, second kernel requries first 640K of memory to boot. Copy @@ -33,26 +31,28 @@ struct sha_region sha_regions[16] = {}; */ static int copy_backup_region(void) { - if (backup_dest) - memcpy((void *)backup_dest, (void *)backup_src, backup_sz); - + if (purgatory_backup_dest) { + memcpy((void *)purgatory_backup_dest, + (void *)purgatory_backup_src, purgatory_backup_sz); + } return 0; } -int verify_sha256_digest(void) +static int verify_sha256_digest(void) { - struct sha_region *ptr, *end; + struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; struct sha256_state sctx; sha256_init(&sctx); - end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; - for (ptr = sha_regions; ptr < end; ptr++) + end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); + + for (ptr = purgatory_sha_regions; ptr < end; ptr++) sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); sha256_final(&sctx, digest); - if (memcmp(digest, sha256_digest, sizeof(digest))) + if (memcmp(digest, purgatory_sha256_digest, sizeof(digest))) return 1; return 0; diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S index fe3c91ba1bd0..dfae9b9e60b5 100644 --- a/arch/x86/purgatory/setup-x86_64.S +++ b/arch/x86/purgatory/setup-x86_64.S @@ -9,6 +9,7 @@ * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. 
*/ +#include <asm/purgatory.h> .text .globl purgatory_start diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h index bd15a4127735..2867d9825a57 100644 --- a/arch/x86/purgatory/sha256.h +++ b/arch/x86/purgatory/sha256.h @@ -10,7 +10,6 @@ #ifndef SHA256_H #define SHA256_H - #include <linux/types.h> #include <crypto/sha.h> diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 0bc60a308730..2a2d89d39af6 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -7,3 +7,17 @@ config MCE_AMD_INJ aspects of the MCE handling code. WARNING: Do not even assume this interface is staying stable! + +config RAS_CEC + bool "Correctable Errors Collector" + depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS + ---help--- + This is a small cache which collects correctable memory errors per 4K + page PFN and counts their repeated occurrence. Once the counter for a + PFN overflows, we try to soft-offline that page as we take it to mean + that it has reached a relatively high error count and would probably + be best if we don't use it anymore. + + Bear in mind that this is absolutely useless if your platform doesn't + have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. + diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 5db706f14111..a163a90af4aa 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -2,7 +2,7 @@ #include <linux/slab.h> #include <linux/memblock.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/pgtable.h> #include <asm/realmode.h> #include <asm/tlbflush.h> diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index e7e7055a8658..46cbbfe03285 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -8,7 +8,7 @@ else BITS := 64 endif -obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \ +obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ stub_$(BITS).o stub_segv.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ @@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \ ifeq ($(CONFIG_X86_32),y) -obj-y += checksum_32.o +obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index e59eef20647b..b291ca5cf66b 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -78,7 +78,7 @@ static inline int ptrace_set_thread_area(struct task_struct *child, int idx, return -ENOSYS; } -extern long arch_prctl(struct task_struct *task, int code, +extern long arch_prctl(struct task_struct *task, int option, unsigned long __user *addr); #endif diff --git a/arch/x86/um/bug.c b/arch/x86/um/bug.c deleted file mode 100644 index e8034e363d83..000000000000 --- a/arch/x86/um/bug.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com) - * Licensed under the GPL V2 - */ - -#include <linux/uaccess.h> - -/* - * Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because - * that's not relevant in skas mode. 
- */ - -int is_valid_bugaddr(unsigned long eip) -{ - unsigned short ud2; - - if (probe_kernel_address((unsigned short __user *)eip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c index 96eb2bd28832..8431e87ac333 100644 --- a/arch/x86/um/os-Linux/prctl.c +++ b/arch/x86/um/os-Linux/prctl.c @@ -6,7 +6,7 @@ #include <sys/ptrace.h> #include <asm/ptrace.h> -int os_arch_prctl(int pid, int code, unsigned long *addr) +int os_arch_prctl(int pid, int option, unsigned long *arg2) { - return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code); + return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option); } diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c new file mode 100644 index 000000000000..627d68836b16 --- /dev/null +++ b/arch/x86/um/syscalls_32.c @@ -0,0 +1,7 @@ +#include <linux/syscalls.h> +#include <os.h> + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return -EINVAL; +} diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index 10d907098c26..58f51667e2e4 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -7,13 +7,15 @@ #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/syscalls.h> #include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> -long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) +long arch_prctl(struct task_struct *task, int option, + unsigned long __user *arg2) { - unsigned long *ptr = addr, tmp; + unsigned long *ptr = arg2, tmp; long ret; int pid = task->mm->context.id.u.pid; @@ -30,7 +32,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) * arch_prctl is run on the host, then the registers are read * back. */ - switch (code) { + switch (option) { case ARCH_SET_FS: case ARCH_SET_GS: ret = restore_registers(pid, ¤t->thread.regs.regs); @@ -50,11 +52,11 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) ptr = &tmp; } - ret = os_arch_prctl(pid, code, ptr); + ret = os_arch_prctl(pid, option, ptr); if (ret) return ret; - switch (code) { + switch (option) { case ARCH_SET_FS: current->thread.arch.fs = (unsigned long) ptr; ret = save_registers(pid, ¤t->thread.regs.regs); @@ -63,19 +65,19 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) ret = save_registers(pid, ¤t->thread.regs.regs); break; case ARCH_GET_FS: - ret = put_user(tmp, addr); + ret = put_user(tmp, arg2); break; case ARCH_GET_GS: - ret = put_user(tmp, addr); + ret = put_user(tmp, arg2); break; } return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return arch_prctl(current, code, (unsigned long __user *) addr); + return arch_prctl(current, option, (unsigned long __user *) arg2); } void arch_switch_to(struct task_struct *to) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 76b6dbd627df..027987638e98 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,6 @@ config XEN bool "Xen guest support" depends on PARAVIRT select PARAVIRT_CLOCK - select XEN_HAVE_PVMMU - select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -15,18 +13,41 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. 
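
For context on the arch_prctl() plumbing renamed above (code/addr becoming option/arg2), here is a minimal userspace caller; this is a sketch rather than anything from the patch, it assumes an x86-64 Linux host, and it uses the raw syscall because glibc historically provides no arch_prctl() wrapper:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>		/* ARCH_GET_FS, ARCH_GET_GS */

int main(void)
{
	unsigned long fs_base = 0, gs_base = 0;

	/* option in the first argument, pointer result in the second */
	if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fs_base) ||
	    syscall(SYS_arch_prctl, ARCH_GET_GS, &gs_base)) {
		perror("arch_prctl");
		return 1;
	}

	printf("FS base: %#lx\nGS base: %#lx\n", fs_base, gs_base);
	return 0;
}
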
-config XEN_DOM0 +config XEN_PV + bool "Xen PV guest support" + default y + depends on XEN + select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU + help + Support running as a Xen PV guest. + +config XEN_PV_SMP def_bool y - depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on XEN_PV && SMP + +config XEN_DOM0 + bool "Xen PV Dom0 support" + default y + depends on XEN_PV && PCI_XEN && SWIOTLB_XEN depends on X86_IO_APIC && ACPI && PCI + help + Support running as a Xen PV Dom0 guest. config XEN_PVHVM - def_bool y + bool "Xen PVHVM guest support" + default y depends on XEN && PCI && X86_LOCAL_APIC + help + Support running as a Xen PVHVM guest. + +config XEN_PVHVM_SMP + def_bool y + depends on XEN_PVHVM && SMP config XEN_512GB bool "Limit Xen pv-domain memory to 512GB" - depends on XEN && X86_64 + depends on XEN_PV && X86_64 default y help Limit paravirtualized user domains to 512GB of RAM. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index cb0164aee156..fffb0a16f9e3 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -7,17 +7,23 @@ endif # Make sure early boot has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_enlighten.o := $(nostackp) -CFLAGS_mmu.o := $(nostackp) +CFLAGS_enlighten_pv.o := $(nostackp) +CFLAGS_mmu_pv.o := $(nostackp) -obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ +obj-y := enlighten.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o pmu.o + grant-table.o suspend.o platform-pci-unplug.o + +obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o mmu_hvm.o suspend_hvm.o +obj-$(CONFIG_XEN_PV) += setup.o apic.o pmu.o suspend_pv.o \ + p2m.o enlighten_pv.o mmu_pv.o +obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_XEN_PV_SMP) += smp_pv.o +obj-$(CONFIG_XEN_PVHVM_SMP) += smp_hvm.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += vga.o diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 3be012115853..30bb2e80cfe7 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -81,7 +81,7 @@ static const struct efi efi_xen __initconst = { .update_capsule = xen_efi_update_capsule, .query_capsule_caps = xen_efi_query_capsule_caps, .get_next_high_mono_count = xen_efi_get_next_high_mono_count, - .reset_system = NULL, /* Functionality provided by Xen. */ + .reset_system = xen_efi_reset_system, .set_virtual_address_map = NULL, /* Not used under Xen. */ .flags = 0 /* Initialized later. */ }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ec1d5c46e58f..a5ffcbb20cc0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,94 +1,16 @@ -/* - * Core of Xen paravirt_ops implementation. 
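The arch/x86/xen/Kconfig and Makefile hunks above split the old monolithic Xen guest support into separate PV and PVHVM pieces, with enlighten.c, mmu.c, smp.c and suspend.c each gaining _pv/_hvm objects alongside the common code. A configuration fragment sketched only from the new symbols introduced in these hunks would look like:

CONFIG_XEN=y
CONFIG_XEN_PV=y
CONFIG_XEN_PV_SMP=y
CONFIG_XEN_DOM0=y
CONFIG_XEN_PVHVM=y
CONFIG_XEN_PVHVM_SMP=y
CONFIG_XEN_512GB=y

Note that XEN_PV_SMP and XEN_PVHVM_SMP are def_bool symbols, so they follow automatically from SMP=y rather than being set by hand, and XEN_DOM0 additionally requires PCI_XEN, SWIOTLB_XEN, X86_IO_APIC, ACPI and PCI, exactly as its depends on lines state.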
- * - * This file contains the xen_paravirt_ops structure itself, and the - * implementations for: - * - privileged instructions - * - interrupt flags - * - segment operations - * - booting and setup - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - #include <linux/cpu.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/preempt.h> -#include <linux/hardirq.h> -#include <linux/percpu.h> -#include <linux/delay.h> -#include <linux/start_kernel.h> -#include <linux/sched.h> -#include <linux/kprobes.h> -#include <linux/bootmem.h> -#include <linux/export.h> -#include <linux/mm.h> -#include <linux/page-flags.h> -#include <linux/highmem.h> -#include <linux/console.h> -#include <linux/pci.h> -#include <linux/gfp.h> -#include <linux/memblock.h> -#include <linux/edd.h> -#include <linux/frame.h> - #include <linux/kexec.h> -#include <xen/xen.h> -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/version.h> -#include <xen/interface/physdev.h> -#include <xen/interface/vcpu.h> -#include <xen/interface/memory.h> -#include <xen/interface/nmi.h> -#include <xen/interface/xen-mca.h> -#include <xen/interface/hvm/start_info.h> #include <xen/features.h> #include <xen/page.h> -#include <xen/hvm.h> -#include <xen/hvc-console.h> -#include <xen/acpi.h> -#include <asm/paravirt.h> -#include <asm/apic.h> -#include <asm/page.h> -#include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -#include <asm/xen/cpuid.h> -#include <asm/fixmap.h> -#include <asm/processor.h> -#include <asm/proto.h> -#include <asm/msr-index.h> -#include <asm/traps.h> -#include <asm/setup.h> -#include <asm/desc.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/reboot.h> -#include <asm/stackprotector.h> -#include <asm/hypervisor.h> -#include <asm/mach_traps.h> -#include <asm/mwait.h> -#include <asm/pci_x86.h> #include <asm/cpu.h> - -#ifdef CONFIG_ACPI -#include <linux/acpi.h> -#include <asm/acpi.h> -#include <acpi/pdc_intel.h> -#include <acpi/processor.h> -#include <xen/interface/platform.h> -#endif +#include <asm/e820/api.h> #include "xen-ops.h" -#include "mmu.h" #include "smp.h" -#include "multicalls.h" #include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -135,13 +57,8 @@ EXPORT_SYMBOL_GPL(xen_start_info); struct shared_info xen_dummy_shared_info; -void *xen_initial_gdt; - -RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); - -static int xen_cpu_up_prepare(unsigned int cpu); -static int xen_cpu_up_online(unsigned int cpu); -static int xen_cpu_dead(unsigned int cpu); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* * Point at some empty memory to start with. We map the real shared_info @@ -162,34 +79,32 @@ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +int xen_have_vcpu_info_placement = 1; -struct tls_descs { - struct desc_struct desc[3]; -}; +static int xen_cpu_up_online(unsigned int cpu) +{ + xen_init_lock_cpu(cpu); + return 0; +} -/* - * Updating the 3 TLS descriptors in the GDT on every task switch is - * surprisingly expensive so we avoid updating them if they haven't - * changed. Since Xen writes different descriptors than the one - * passed in the update_descriptor hypercall we keep shadow copies to - * compare against. 
- */ -static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), + int (*cpu_dead_cb)(unsigned int)) +{ + int rc; -#ifdef CONFIG_XEN_PVH -/* - * PVH variables. - * - * xen_pvh and pvh_bootparams need to live in data segment since they - * are used after startup_{32|64}, which clear .bss, are invoked. - */ -bool xen_pvh __attribute__((section(".data"))) = 0; -struct boot_params pvh_bootparams __attribute__((section(".data"))); + rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, + "x86/xen/hvm_guest:prepare", + cpu_up_prepare_cb, cpu_dead_cb); + if (rc >= 0) { + rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/xen/hvm_guest:online", + xen_cpu_up_online, NULL); + if (rc < 0) + cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); + } -struct hvm_start_info pvh_start_info; -unsigned int pvh_start_info_sz = sizeof(pvh_start_info); -#endif + return rc >= 0 ? 0 : rc; +} static void clamp_max_cpus(void) { @@ -226,7 +141,7 @@ void xen_vcpu_setup(int cpu) per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - if (!have_vcpu_info_placement) { + if (!xen_have_vcpu_info_placement) { if (cpu >= MAX_VIRT_CPUS) clamp_max_cpus(); return; @@ -249,7 +164,7 @@ void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); - have_vcpu_info_placement = 0; + xen_have_vcpu_info_placement = 0; clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if @@ -258,1043 +173,7 @@ void xen_vcpu_setup(int cpu) } } -/* - * On restore, set the vcpu placement up again. - * If it fails, then we're in a bad state, since - * we can't back out from using it... - */ -void xen_vcpu_restore(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), - NULL); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) - BUG(); - - xen_setup_runstate_info(cpu); - - if (have_vcpu_info_placement) - xen_vcpu_setup(cpu); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) - BUG(); - } -} - -static void __init xen_banner(void) -{ - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); - struct xen_extraversion extra; - HYPERVISOR_xen_version(XENVER_extraversion, &extra); - - pr_info("Booting paravirtualized kernel %son %s\n", - xen_feature(XENFEAT_auto_translated_physmap) ? - "with PVH extensions " : "", pv_info.name); - printk(KERN_INFO "Xen version: %d.%d%s%s\n", - version >> 16, version & 0xffff, extra.extraversion, - xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? 
" (preserve-AD)" : ""); -} -/* Check if running on Xen version (major, minor) or later */ -bool -xen_running_on_version_or_later(unsigned int major, unsigned int minor) -{ - unsigned int version; - - if (!xen_domain()) - return false; - - version = HYPERVISOR_xen_version(XENVER_version, NULL); - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || - ((version >> 16) > major)) - return true; - return false; -} - -#define CPUID_THERM_POWER_LEAF 6 -#define APERFMPERF_PRESENT 0 - -static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; -static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; - -static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; -static __read_mostly unsigned int cpuid_leaf5_ecx_val; -static __read_mostly unsigned int cpuid_leaf5_edx_val; - -static void xen_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - unsigned maskebx = ~0; - unsigned maskecx = ~0; - unsigned maskedx = ~0; - unsigned setecx = 0; - /* - * Mask out inconvenient features, to try and disable as many - * unsupported kernel subsystems as possible. - */ - switch (*ax) { - case 1: - maskecx = cpuid_leaf1_ecx_mask; - setecx = cpuid_leaf1_ecx_set_mask; - maskedx = cpuid_leaf1_edx_mask; - break; - - case CPUID_MWAIT_LEAF: - /* Synthesize the values.. */ - *ax = 0; - *bx = 0; - *cx = cpuid_leaf5_ecx_val; - *dx = cpuid_leaf5_edx_val; - return; - - case CPUID_THERM_POWER_LEAF: - /* Disabling APERFMPERF for kernel usage */ - maskecx = ~(1 << APERFMPERF_PRESENT); - break; - - case 0xb: - /* Suppress extended topology stuff */ - maskebx = 0; - break; - } - - asm(XEN_EMULATE_PREFIX "cpuid" - : "=a" (*ax), - "=b" (*bx), - "=c" (*cx), - "=d" (*dx) - : "0" (*ax), "2" (*cx)); - - *bx &= maskebx; - *cx &= maskecx; - *cx |= setecx; - *dx &= maskedx; -} -STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ - -static bool __init xen_check_mwait(void) -{ -#ifdef CONFIG_ACPI - struct xen_platform_op op = { - .cmd = XENPF_set_processor_pminfo, - .u.set_pminfo.id = -1, - .u.set_pminfo.type = XEN_PM_PDC, - }; - uint32_t buf[3]; - unsigned int ax, bx, cx, dx; - unsigned int mwait_mask; - - /* We need to determine whether it is OK to expose the MWAIT - * capability to the kernel to harvest deeper than C3 states from ACPI - * _CST using the processor_harvest_xen.c module. For this to work, we - * need to gather the MWAIT_LEAF values (which the cstate.c code - * checks against). The hypervisor won't expose the MWAIT flag because - * it would break backwards compatibility; so we will find out directly - * from the hardware and hypercall. - */ - if (!xen_initial_domain()) - return false; - - /* - * When running under platform earlier than Xen4.2, do not expose - * mwait, to avoid the risk of loading native acpi pad driver - */ - if (!xen_running_on_version_or_later(4, 2)) - return false; - - ax = 1; - cx = 0; - - native_cpuid(&ax, &bx, &cx, &dx); - - mwait_mask = (1 << (X86_FEATURE_EST % 32)) | - (1 << (X86_FEATURE_MWAIT % 32)); - - if ((cx & mwait_mask) != mwait_mask) - return false; - - /* We need to emulate the MWAIT_LEAF and for that we need both - * ecx and edx. The hypercall provides only partial information. - */ - - ax = CPUID_MWAIT_LEAF; - bx = 0; - cx = 0; - dx = 0; - - native_cpuid(&ax, &bx, &cx, &dx); - - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, - * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. 
- */ - buf[0] = ACPI_PDC_REVISION_ID; - buf[1] = 1; - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); - - set_xen_guest_handle(op.u.set_pminfo.pdc, buf); - - if ((HYPERVISOR_platform_op(&op) == 0) && - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { - cpuid_leaf5_ecx_val = cx; - cpuid_leaf5_edx_val = dx; - } - return true; -#else - return false; -#endif -} -static void __init xen_init_cpuid_mask(void) -{ - unsigned int ax, bx, cx, dx; - unsigned int xsave_mask; - - cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ - - if (!xen_initial_domain()) - cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ - - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); - - ax = 1; - cx = 0; - cpuid(1, &ax, &bx, &cx, &dx); - - xsave_mask = - (1 << (X86_FEATURE_XSAVE % 32)) | - (1 << (X86_FEATURE_OSXSAVE % 32)); - - /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ - if ((cx & xsave_mask) != xsave_mask) - cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ - if (xen_check_mwait()) - cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); -} - -static void xen_set_debugreg(int reg, unsigned long val) -{ - HYPERVISOR_set_debugreg(reg, val); -} - -static unsigned long xen_get_debugreg(int reg) -{ - return HYPERVISOR_get_debugreg(reg); -} - -static void xen_end_context_switch(struct task_struct *next) -{ - xen_mc_flush(); - paravirt_end_context_switch(next); -} - -static unsigned long xen_store_tr(void) -{ - return 0; -} - -/* - * Set the page permissions for a particular virtual address. If the - * address is a vmalloc mapping (or other non-linear mapping), then - * find the linear mapping of the page and also set its protections to - * match. - */ -static void set_aliased_prot(void *v, pgprot_t prot) -{ - int level; - pte_t *ptep; - pte_t pte; - unsigned long pfn; - struct page *page; - unsigned char dummy; - - ptep = lookup_address((unsigned long)v, &level); - BUG_ON(ptep == NULL); - - pfn = pte_pfn(*ptep); - page = pfn_to_page(pfn); - - pte = pfn_pte(pfn, prot); - - /* - * Careful: update_va_mapping() will fail if the virtual address - * we're poking isn't populated in the page tables. We don't - * need to worry about the direct map (that's always in the page - * tables), but we need to be careful about vmap space. In - * particular, the top level page table can lazily propagate - * entries between processes, so if we've switched mms since we - * vmapped the target in the first place, we might not have the - * top-level page table entry populated. - * - * We disable preemption because we want the same mm active when - * we probe the target and when we issue the hypercall. We'll - * have the same nominal mm, but if we're a kernel thread, lazy - * mm dropping could change our pgd. - * - * Out of an abundance of caution, this uses __get_user() to fault - * in the target address just in case there's some obscure case - * in which the target address isn't readable. 
- */ - - preempt_disable(); - - probe_kernel_read(&dummy, v, 1); - - if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) - BUG(); - - if (!PageHighMem(page)) { - void *av = __va(PFN_PHYS(pfn)); - - if (av != v) - if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) - BUG(); - } else - kmap_flush_unused(); - - preempt_enable(); -} - -static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) -{ - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; - int i; - - /* - * We need to mark the all aliases of the LDT pages RO. We - * don't need to call vm_flush_aliases(), though, since that's - * only responsible for flushing aliases out the TLBs, not the - * page tables, and Xen will flush the TLB for us if needed. - * - * To avoid confusing future readers: none of this is necessary - * to load the LDT. The hypervisor only checks this when the - * LDT is faulted in due to subsequent descriptor access. - */ - - for(i = 0; i < entries; i += entries_per_page) - set_aliased_prot(ldt + i, PAGE_KERNEL_RO); -} - -static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) -{ - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; - int i; - - for(i = 0; i < entries; i += entries_per_page) - set_aliased_prot(ldt + i, PAGE_KERNEL); -} - -static void xen_set_ldt(const void *addr, unsigned entries) -{ - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - trace_xen_cpu_set_ldt(addr, entries); - - op = mcs.args; - op->cmd = MMUEXT_SET_LDT; - op->arg1.linear_addr = (unsigned long)addr; - op->arg2.nr_ents = entries; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void xen_load_gdt(const struct desc_ptr *dtr) -{ - unsigned long va = dtr->address; - unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. - */ - - BUG_ON(size > 65536); - BUG_ON(va & ~PAGE_MASK); - - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - int level; - pte_t *ptep; - unsigned long pfn, mfn; - void *virt; - - /* - * The GDT is per-cpu and is in the percpu data area. - * That can be virtually mapped, so we need to do a - * page-walk to get the underlying MFN for the - * hypercall. The page can also be in the kernel's - * linear range, so we need to RO that mapping too. - */ - ptep = lookup_address(va, &level); - BUG_ON(ptep == NULL); - - pfn = pte_pfn(*ptep); - mfn = pfn_to_mfn(pfn); - virt = __va(PFN_PHYS(pfn)); - - frames[f] = mfn; - - make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(virt); - } - - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) - BUG(); -} - -/* - * load_gdt for early boot, when the gdt is only mapped once - */ -static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) -{ - unsigned long va = dtr->address; - unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. 
- */ - - BUG_ON(size > 65536); - BUG_ON(va & ~PAGE_MASK); - - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - pte_t pte; - unsigned long pfn, mfn; - - pfn = virt_to_pfn(va); - mfn = pfn_to_mfn(pfn); - - pte = pfn_pte(pfn, PAGE_KERNEL_RO); - - if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) - BUG(); - - frames[f] = mfn; - } - - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) - BUG(); -} - -static inline bool desc_equal(const struct desc_struct *d1, - const struct desc_struct *d2) -{ - return d1->a == d2->a && d1->b == d2->b; -} - -static void load_TLS_descriptor(struct thread_struct *t, - unsigned int cpu, unsigned int i) -{ - struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; - struct desc_struct *gdt; - xmaddr_t maddr; - struct multicall_space mc; - - if (desc_equal(shadow, &t->tls_array[i])) - return; - - *shadow = t->tls_array[i]; - - gdt = get_cpu_gdt_table(cpu); - maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - mc = __xen_mc_entry(0); - - MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); -} - -static void xen_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * XXX sleazy hack: If we're being called in a lazy-cpu zone - * and lazy gs handling is enabled, it means we're in a - * context switch, and %gs has just been saved. This means we - * can zero it out to prevent faults on exit from the - * hypervisor if the next process has no %gs. Either way, it - * has been saved, and the new value will get loaded properly. - * This will go away as soon as Xen has been modified to not - * save/restore %gs for normal hypercalls. - * - * On x86_64, this hack is not used for %gs, because gs points - * to KERNEL_GS_BASE (and uses it for PDA references), so we - * must not zero %gs on x86_64 - * - * For x86_64, we need to zero %fs, otherwise we may get an - * exception between the new %fs descriptor being loaded and - * %fs being effectively cleared at __switch_to(). - */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { -#ifdef CONFIG_X86_32 - lazy_load_gs(0); -#else - loadsegment(fs, 0); -#endif - } - - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -#ifdef CONFIG_X86_64 -static void xen_load_gs_index(unsigned int idx) -{ - if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) - BUG(); -} -#endif - -static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, - const void *ptr) -{ - xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); - u64 entry = *(u64 *)ptr; - - trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); - - preempt_disable(); - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) - BUG(); - - preempt_enable(); -} - -static int cvt_gate_to_trap(int vector, const gate_desc *val, - struct trap_info *info) -{ - unsigned long addr; - - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) - return 0; - - info->vector = vector; - - addr = gate_offset(*val); -#ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. 
- */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ - return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } -#endif /* CONFIG_X86_64 */ - info->address = addr; - - info->cs = gate_segment(*val); - info->flags = val->dpl; - /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) - info->flags |= 1 << 2; - - return 1; -} - -/* Locations of each CPU's IDT */ -static DEFINE_PER_CPU(struct desc_ptr, idt_desc); - -/* Set an IDT entry. If the entry is part of the current IDT, then - also update Xen. */ -static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) -{ - unsigned long p = (unsigned long)&dt[entrynum]; - unsigned long start, end; - - trace_xen_cpu_write_idt_entry(dt, entrynum, g); - - preempt_disable(); - - start = __this_cpu_read(idt_desc.address); - end = start + __this_cpu_read(idt_desc.size) + 1; - - xen_mc_flush(); - - native_write_idt_entry(dt, entrynum, g); - - if (p >= start && (p + 8) <= end) { - struct trap_info info[2]; - - info[1].address = 0; - - if (cvt_gate_to_trap(entrynum, g, &info[0])) - if (HYPERVISOR_set_trap_table(info)) - BUG(); - } - - preempt_enable(); -} - -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) -{ - unsigned in, out, count; - - count = (desc->size+1) / sizeof(gate_desc); - BUG_ON(count > 256); - - for (in = out = 0; in < count; in++) { - gate_desc *entry = (gate_desc*)(desc->address) + in; - - if (cvt_gate_to_trap(in, entry, &traps[out])) - out++; - } - traps[out].address = 0; -} - -void xen_copy_trap_info(struct trap_info *traps) -{ - const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - - xen_convert_trap_info(desc, traps); -} - -/* Load a new IDT into Xen. In principle this can be per-CPU, so we - hold a spinlock to protect the static traps[] array (static because - it avoids allocation, and saves stack space). */ -static void xen_load_idt(const struct desc_ptr *desc) -{ - static DEFINE_SPINLOCK(lock); - static struct trap_info traps[257]; - - trace_xen_cpu_load_idt(desc); - - spin_lock(&lock); - - memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - - xen_convert_trap_info(desc, traps); - - xen_mc_flush(); - if (HYPERVISOR_set_trap_table(traps)) - BUG(); - - spin_unlock(&lock); -} - -/* Write a GDT descriptor entry. Ignore LDT descriptors, since - they're handled differently. */ -static void xen_write_gdt_entry(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); - - preempt_disable(); - - switch (type) { - case DESC_LDT: - case DESC_TSS: - /* ignore */ - break; - - default: { - xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) - BUG(); - } - - } - - preempt_enable(); -} - -/* - * Version of write_gdt_entry for use at early boot-time needed to - * update an entry as simply as possible. 
- */ -static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); - - switch (type) { - case DESC_LDT: - case DESC_TSS: - /* ignore */ - break; - - default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); - - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) - dt[entry] = *(struct desc_struct *)desc; - } - - } -} - -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - struct multicall_space mcs; - - mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; -} - -void xen_set_iopl_mask(unsigned mask) -{ - struct physdev_set_iopl set_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; - HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -static void xen_io_delay(void) -{ -} - -static DEFINE_PER_CPU(unsigned long, xen_cr0_value); - -static unsigned long xen_read_cr0(void) -{ - unsigned long cr0 = this_cpu_read(xen_cr0_value); - - if (unlikely(cr0 == 0)) { - cr0 = native_read_cr0(); - this_cpu_write(xen_cr0_value, cr0); - } - - return cr0; -} - -static void xen_write_cr0(unsigned long cr0) -{ - struct multicall_space mcs; - - this_cpu_write(xen_cr0_value, cr0); - - /* Only pay attention to cr0.TS; everything else is - ignored. */ - mcs = xen_mc_entry(0); - - MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void xen_write_cr4(unsigned long cr4) -{ - cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); - - native_write_cr4(cr4); -} -#ifdef CONFIG_X86_64 -static inline unsigned long xen_read_cr8(void) -{ - return 0; -} -static inline void xen_write_cr8(unsigned long val) -{ - BUG_ON(val); -} -#endif - -static u64 xen_read_msr_safe(unsigned int msr, int *err) -{ - u64 val; - - if (pmu_msr_read(msr, &val, err)) - return val; - - val = native_read_msr_safe(msr, err); - switch (msr) { - case MSR_IA32_APICBASE: -#ifdef CONFIG_X86_X2APIC - if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) -#endif - val &= ~X2APIC_ENABLE; - break; - } - return val; -} - -static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) -{ - int ret; - - ret = 0; - - switch (msr) { -#ifdef CONFIG_X86_64 - unsigned which; - u64 base; - - case MSR_FS_BASE: which = SEGBASE_FS; goto set; - case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; - case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; - - set: - base = ((u64)high << 32) | low; - if (HYPERVISOR_set_segment_base(which, base) != 0) - ret = -EIO; - break; -#endif - - case MSR_STAR: - case MSR_CSTAR: - case MSR_LSTAR: - case MSR_SYSCALL_MASK: - case MSR_IA32_SYSENTER_CS: - case MSR_IA32_SYSENTER_ESP: - case MSR_IA32_SYSENTER_EIP: - /* Fast syscall setup is all done in hypercalls, so - these are all ignored. Stub them out here to stop - Xen console noise. */ - break; - - default: - if (!pmu_msr_write(msr, low, high, &ret)) - ret = native_write_msr_safe(msr, low, high); - } - - return ret; -} - -static u64 xen_read_msr(unsigned int msr) -{ - /* - * This will silently swallow a #GP from RDMSR. It may be worth - * changing that. - */ - int err; - - return xen_read_msr_safe(msr, &err); -} - -static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) -{ - /* - * This will silently swallow a #GP from WRMSR. It may be worth - * changing that. 
- */ - xen_write_msr_safe(msr, low, high); -} - -void xen_setup_shared_info(void) -{ - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - set_fixmap(FIX_PARAVIRT_BOOTMAP, - xen_start_info->shared_info); - - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - } else - HYPERVISOR_shared_info = - (struct shared_info *)__va(xen_start_info->shared_info); - -#ifndef CONFIG_SMP - /* In UP this is as good a place as any to set up shared info */ - xen_setup_vcpu_info_placement(); -#endif - - xen_setup_mfn_list_list(); -} - -/* This is called once we have the cpu_possible_mask */ -void xen_setup_vcpu_info_placement(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - /* Set up direct vCPU id mapping for PV guests. */ - per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); - } - - /* - * xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. - */ - if (have_vcpu_info_placement) { - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); - pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); - pv_mmu_ops.read_cr2 = xen_read_cr2_direct; - } -} - -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. 
*/ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - -static const struct pv_info xen_info __initconst = { - .shared_kernel_pmd = 0, - -#ifdef CONFIG_X86_64 - .extra_user_64bit_cs = FLAT_USER_CS64, -#endif - .name = "Xen", -}; - -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - -static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .cpuid = xen_cpuid, - - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, - - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, - - .read_cr4 = native_read_cr4, - .write_cr4 = xen_write_cr4, - -#ifdef CONFIG_X86_64 - .read_cr8 = xen_read_cr8, - .write_cr8 = xen_write_cr8, -#endif - - .wbinvd = native_wbinvd, - - .read_msr = xen_read_msr, - .write_msr = xen_write_msr, - - .read_msr_safe = xen_read_msr_safe, - .write_msr_safe = xen_write_msr_safe, - - .read_pmc = xen_read_pmc, - - .iret = xen_iret, -#ifdef CONFIG_X86_64 - .usergs_sysret64 = xen_sysret64, -#endif - - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, -#ifdef CONFIG_X86_64 - .load_gs_index = xen_load_gs_index, -#endif - - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, - - .store_idt = native_store_idt, - .store_tr = xen_store_tr, - - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_sp0 = xen_load_sp0, - - .set_iopl_mask = xen_set_iopl_mask, - .io_delay = xen_io_delay, - - /* Xen takes care of %gs when switching to usermode for us */ - .swapgs = paravirt_nop, - - .start_context_switch = paravirt_start_context_switch, - .end_context_switch = xen_end_context_switch, -}; - -static void xen_reboot(int reason) +void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; int cpu; @@ -1306,33 +185,11 @@ static void xen_reboot(int reason) BUG(); } -static void xen_restart(char *msg) +void xen_emergency_restart(void) { xen_reboot(SHUTDOWN_reboot); } -static void xen_emergency_restart(void) -{ - xen_reboot(SHUTDOWN_reboot); -} - -static void xen_machine_halt(void) -{ - xen_reboot(SHUTDOWN_poweroff); -} - -static void xen_machine_power_off(void) -{ - if (pm_power_off) - pm_power_off(); - xen_reboot(SHUTDOWN_poweroff); -} - -static void xen_crash_shutdown(struct pt_regs *regs) -{ - xen_reboot(SHUTDOWN_crash); -} - static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -1342,7 +199,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) } static struct notifier_block xen_panic_block = { - .notifier_call= xen_panic_event, + .notifier_call = xen_panic_event, .priority = INT_MIN }; @@ -1352,627 +209,7 @@ int xen_panic_handler_init(void) return 0; } -static const struct machine_ops xen_machine_ops __initconst = { - .restart = xen_restart, - .halt = xen_machine_halt, - .power_off = xen_machine_power_off, - .shutdown = xen_machine_halt, - .crash_shutdown = xen_crash_shutdown, - .emergency_restart = xen_emergency_restart, -}; - -static unsigned char xen_get_nmi_reason(void) -{ - unsigned char reason = 0; - - /* Construct a value which looks like it came from port 0x61. 
*/ - if (test_bit(_XEN_NMIREASON_io_error, - &HYPERVISOR_shared_info->arch.nmi_reason)) - reason |= NMI_REASON_IOCHK; - if (test_bit(_XEN_NMIREASON_pci_serr, - &HYPERVISOR_shared_info->arch.nmi_reason)) - reason |= NMI_REASON_SERR; - - return reason; -} - -static void __init xen_boot_params_init_edd(void) -{ -#if IS_ENABLED(CONFIG_EDD) - struct xen_platform_op op; - struct edd_info *edd_info; - u32 *mbr_signature; - unsigned nr; - int ret; - - edd_info = boot_params.eddbuf; - mbr_signature = boot_params.edd_mbr_sig_buffer; - - op.cmd = XENPF_firmware_info; - - op.u.firmware_info.type = XEN_FW_DISK_INFO; - for (nr = 0; nr < EDDMAXNR; nr++) { - struct edd_info *info = edd_info + nr; - - op.u.firmware_info.index = nr; - info->params.length = sizeof(info->params); - set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, - &info->params); - ret = HYPERVISOR_platform_op(&op); - if (ret) - break; - -#define C(x) info->x = op.u.firmware_info.u.disk_info.x - C(device); - C(version); - C(interface_support); - C(legacy_max_cylinder); - C(legacy_max_head); - C(legacy_sectors_per_track); -#undef C - } - boot_params.eddbuf_entries = nr; - - op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; - for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { - op.u.firmware_info.index = nr; - ret = HYPERVISOR_platform_op(&op); - if (ret) - break; - mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; - } - boot_params.edd_mbr_sig_buf_entries = nr; -#endif -} - -/* - * Set up the GDT and segment registers for -fstack-protector. Until - * we do this, we have to be careful not to call any stack-protected - * function, which is most of the kernel. - */ -static void xen_setup_gdt(int cpu) -{ - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; - pv_cpu_ops.load_gdt = xen_load_gdt_boot; - - setup_stack_canary_segment(0); - switch_to_new_gdt(0); - - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; - pv_cpu_ops.load_gdt = xen_load_gdt; -} - -static void __init xen_dom0_set_legacy_features(void) -{ - x86_platform.legacy.rtc = 1; -} - -static int xen_cpuhp_setup(void) -{ - int rc; - - rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "x86/xen/hvm_guest:prepare", - xen_cpu_up_prepare, xen_cpu_dead); - if (rc >= 0) { - rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "x86/xen/hvm_guest:online", - xen_cpu_up_online, NULL); - if (rc < 0) - cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); - } - - return rc >= 0 ? 0 : rc; -} - -/* First C function to be called on Xen boot */ -asmlinkage __visible void __init xen_start_kernel(void) -{ - struct physdev_set_iopl set_iopl; - unsigned long initrd_start = 0; - int rc; - - if (!xen_start_info) - return; - - xen_domain_type = XEN_PV_DOMAIN; - - xen_setup_features(); - - xen_setup_machphys_mapping(); - - /* Install Xen paravirt ops */ - pv_info = xen_info; - pv_init_ops = xen_init_ops; - pv_cpu_ops = xen_cpu_ops; - - x86_platform.get_nmi_reason = xen_get_nmi_reason; - - x86_init.resources.memory_setup = xen_memory_setup; - x86_init.oem.arch_setup = xen_arch_setup; - x86_init.oem.banner = xen_banner; - - xen_init_time_ops(); - - /* - * Set up some pagetable state before starting to set any ptes. - */ - - xen_init_mmu_ops(); - - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - - /* - * Prevent page tables from being allocated in highmem, even - * if CONFIG_HIGHPTE is enabled. 
- */ - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - - /* Work out if we support NX */ - x86_configure_nx(); - - /* Get mfn list */ - xen_build_dynamic_phys_to_machine(); - - /* - * Set up kernel GDT and segment registers, mainly so that - * -fstack-protector code can be executed. - */ - xen_setup_gdt(0); - - xen_init_irq_ops(); - xen_init_cpuid_mask(); - -#ifdef CONFIG_X86_LOCAL_APIC - /* - * set up the basic apic ops. - */ - xen_init_apic(); -#endif - - if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { - pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; - pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; - } - - machine_ops = xen_machine_ops; - - /* - * The only reliable way to retain the initial address of the - * percpu gdt_page is to remember it here, so we can go and - * mark it RW later, when the initial percpu area is freed. - */ - xen_initial_gdt = &per_cpu(gdt_page, 0); - - xen_smp_init(); - -#ifdef CONFIG_ACPI_NUMA - /* - * The pages we from Xen are not related to machine pages, so - * any NUMA information the kernel tries to get from ACPI will - * be meaningless. Prevent it from trying. - */ - acpi_numa = -1; -#endif - /* Don't do the full vcpu_info placement stuff until we have a - possible map and a non-dummy shared_info. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; - - WARN_ON(xen_cpuhp_setup()); - - local_irq_disable(); - early_boot_irqs_disabled = true; - - xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, - xen_start_info->nr_pages); - xen_reserve_special_pages(); - - /* keep using Xen gdt for now; no urgent need to change it */ - -#ifdef CONFIG_X86_32 - pv_info.kernel_rpl = 1; - if (xen_feature(XENFEAT_supervisor_mode_kernel)) - pv_info.kernel_rpl = 0; -#else - pv_info.kernel_rpl = 0; -#endif - /* set the limit of our address space */ - xen_reserve_top(); - - /* - * We used to do this in xen_arch_setup, but that is too late - * on AMD were early_cpu_init (run before ->arch_setup()) calls - * early_amd_init which pokes 0xcf8 port. 
- */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); - -#ifdef CONFIG_X86_32 - /* set up basic CPUID stuff */ - cpu_detect(&new_cpu_data); - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - new_cpu_data.wp_works_ok = 1; - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); -#endif - - if (xen_start_info->mod_start) { - if (xen_start_info->flags & SIF_MOD_START_PFN) - initrd_start = PFN_PHYS(xen_start_info->mod_start); - else - initrd_start = __pa(xen_start_info->mod_start); - } - - /* Poke various useful things into boot_params */ - boot_params.hdr.type_of_loader = (9 << 4) | 0; - boot_params.hdr.ramdisk_image = initrd_start; - boot_params.hdr.ramdisk_size = xen_start_info->mod_len; - boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; - - if (!xen_initial_domain()) { - add_preferred_console("xenboot", 0, NULL); - add_preferred_console("tty", 0, NULL); - add_preferred_console("hvc", 0, NULL); - if (pci_xen) - x86_init.pci.arch_init = pci_xen_init; - } else { - const struct dom0_vga_console_info *info = - (void *)((char *)xen_start_info + - xen_start_info->console.dom0.info_off); - struct xen_platform_op op = { - .cmd = XENPF_firmware_info, - .interface_version = XENPF_INTERFACE_VERSION, - .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, - }; - - x86_platform.set_legacy_features = - xen_dom0_set_legacy_features; - xen_init_vga(info, xen_start_info->console.dom0.info_size); - xen_start_info->console.domU.mfn = 0; - xen_start_info->console.domU.evtchn = 0; - - if (HYPERVISOR_platform_op(&op) == 0) - boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; - - /* Make sure ACS will be enabled */ - pci_request_acs(); - - xen_acpi_sleep_register(); - - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - - xen_boot_params_init_edd(); - } -#ifdef CONFIG_PCI - /* PCI BIOS service won't work from a PV guest. */ - pci_probe &= ~PCI_PROBE_BIOS; -#endif - xen_raw_console_write("about to get started...\n"); - - /* Let's presume PV guests always boot on vCPU with id 0. */ - per_cpu(xen_vcpu_id, 0) = 0; - - xen_setup_runstate_info(0); - - xen_efi_init(); - - /* Start the world */ -#ifdef CONFIG_X86_32 - i386_start_kernel(); -#else - cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ - x86_64_start_reservations((char *)__pa_symbol(&boot_params)); -#endif -} - -#ifdef CONFIG_XEN_PVH - -static void xen_pvh_arch_setup(void) -{ -#ifdef CONFIG_ACPI - /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. 
*/ - if (nr_ioapics == 0) - acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; -#endif -} - -static void __init init_pvh_bootparams(void) -{ - struct xen_memory_map memmap; - unsigned int i; - int rc; - - memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); - - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map); - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc) { - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); - BUG(); - } - - if (memmap.nr_entries < E820MAX - 1) { - pvh_bootparams.e820_map[memmap.nr_entries].addr = - ISA_START_ADDRESS; - pvh_bootparams.e820_map[memmap.nr_entries].size = - ISA_END_ADDRESS - ISA_START_ADDRESS; - pvh_bootparams.e820_map[memmap.nr_entries].type = - E820_RESERVED; - memmap.nr_entries++; - } else - xen_raw_printk("Warning: Can fit ISA range into e820\n"); - - sanitize_e820_map(pvh_bootparams.e820_map, - ARRAY_SIZE(pvh_bootparams.e820_map), - &memmap.nr_entries); - - pvh_bootparams.e820_entries = memmap.nr_entries; - for (i = 0; i < pvh_bootparams.e820_entries; i++) - e820_add_region(pvh_bootparams.e820_map[i].addr, - pvh_bootparams.e820_map[i].size, - pvh_bootparams.e820_map[i].type); - - pvh_bootparams.hdr.cmd_line_ptr = - pvh_start_info.cmdline_paddr; - - /* The first module is always ramdisk. */ - if (pvh_start_info.nr_modules) { - struct hvm_modlist_entry *modaddr = - __va(pvh_start_info.modlist_paddr); - pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; - pvh_bootparams.hdr.ramdisk_size = modaddr->size; - } - - /* - * See Documentation/x86/boot.txt. - * - * Version 2.12 supports Xen entry point but we will use default x86/PC - * environment (i.e. hardware_subarch 0). - */ - pvh_bootparams.hdr.version = 0x212; - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ -} - -/* - * This routine (and those that it might call) should not use - * anything that lives in .bss since that segment will be cleared later. - */ -void __init xen_prepare_pvh(void) -{ - u32 msr; - u64 pfn; - - if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", - pvh_start_info.magic); - BUG(); - } - - xen_pvh = 1; - - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - init_pvh_bootparams(); - - x86_init.oem.arch_setup = xen_pvh_arch_setup; -} -#endif - -void __ref xen_hvm_init_shared_info(void) -{ - int cpu; - struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page = 0; - - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); - xatp.domid = DOMID_SELF; - xatp.idx = 0; - xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; - if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) - BUG(); - - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; - - /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info - * page, we use it in the event channel upcall and in some pvclock - * related functions. We don't need the vcpu_info placement - * optimizations because we don't use any pv_mmu or pv_irq op on - * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and - * in that case multiple vcpus might be online. */ - for_each_online_cpu(cpu) { - /* Leave it to be NULL. 
*/ - if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) - continue; - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - } -} - -#ifdef CONFIG_XEN_PVHVM -static void __init init_hvm_pv_info(void) -{ - int major, minor; - uint32_t eax, ebx, ecx, edx, base; - - base = xen_cpuid_base(); - eax = cpuid_eax(base + 1); - - major = eax >> 16; - minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", major, minor); - - xen_domain_type = XEN_HVM_DOMAIN; - - /* PVH set up hypercall page in xen_prepare_pvh(). */ - if (xen_pvh_domain()) - pv_info.name = "Xen PVH"; - else { - u64 pfn; - uint32_t msr; - - pv_info.name = "Xen HVM"; - msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - } - - xen_setup_features(); - - cpuid(base + 4, &eax, &ebx, &ecx, &edx); - if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) - this_cpu_write(xen_vcpu_id, ebx); - else - this_cpu_write(xen_vcpu_id, smp_processor_id()); -} -#endif - -static int xen_cpu_up_prepare(unsigned int cpu) -{ - int rc; - - if (xen_hvm_domain()) { - /* - * This can happen if CPU was offlined earlier and - * offlining timed out in common_cpu_die(). - */ - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - } - - if (cpu_acpi_id(cpu) != U32_MAX) - per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); - else - per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); - } - - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) - xen_setup_timer(cpu); - - rc = xen_smp_intr_init(cpu); - if (rc) { - WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", - cpu, rc); - return rc; - } - return 0; -} - -static int xen_cpu_dead(unsigned int cpu) -{ - xen_smp_intr_free(cpu); - - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) - xen_teardown_timer(cpu); - - return 0; -} - -static int xen_cpu_up_online(unsigned int cpu) -{ - xen_init_lock_cpu(cpu); - return 0; -} - -#ifdef CONFIG_XEN_PVHVM -#ifdef CONFIG_KEXEC_CORE -static void xen_hvm_shutdown(void) -{ - native_machine_shutdown(); - if (kexec_in_progress) - xen_reboot(SHUTDOWN_soft_reset); -} - -static void xen_hvm_crash_shutdown(struct pt_regs *regs) -{ - native_machine_crash_shutdown(regs); - xen_reboot(SHUTDOWN_soft_reset); -} -#endif - -static void __init xen_hvm_guest_init(void) -{ - if (xen_pv_domain()) - return; - - init_hvm_pv_info(); - - xen_hvm_init_shared_info(); - - xen_panic_handler_init(); - - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); - - xen_hvm_smp_init(); - WARN_ON(xen_cpuhp_setup()); - xen_unplug_emulated_devices(); - x86_init.irqs.intr_init = xen_init_IRQ; - xen_hvm_init_time_ops(); - xen_hvm_init_mmu_ops(); - - if (xen_pvh_domain()) - machine_ops.emergency_restart = xen_emergency_restart; -#ifdef CONFIG_KEXEC_CORE - machine_ops.shutdown = xen_hvm_shutdown; - machine_ops.crash_shutdown = xen_hvm_crash_shutdown; -#endif -} -#endif - -static bool xen_nopv = false; -static __init int xen_parse_nopv(char *arg) -{ - xen_nopv = true; - return 0; -} -early_param("xen_nopv", xen_parse_nopv); - -static uint32_t __init xen_platform(void) -{ - if (xen_nopv) - return 0; - - return xen_cpuid_base(); -} - -bool xen_hvm_need_lapic(void) -{ - if (xen_nopv) - return false; - if (xen_pv_domain()) - return false; - if (!xen_hvm_domain()) - return false; - if (xen_feature(XENFEAT_hvm_pirqs)) - return false; - return true; -} -EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); - -static void xen_set_cpu_features(struct cpuinfo_x86 *c) -{ - if (xen_pv_domain()) { - clear_cpu_bug(c, 
X86_BUG_SYSRET_SS_ATTRS); - set_cpu_cap(c, X86_FEATURE_XENPV); - } -} - -static void xen_pin_vcpu(int cpu) +void xen_pin_vcpu(int cpu) { static bool disable_pinning; struct sched_pin_override pin_override; @@ -2011,18 +248,6 @@ static void xen_pin_vcpu(int cpu) } } -const struct hypervisor_x86 x86_hyper_xen = { - .name = "Xen", - .detect = xen_platform, -#ifdef CONFIG_XEN_PVHVM - .init_platform = xen_hvm_guest_init, -#endif - .x2apic_available = xen_x2apic_para_available, - .set_cpu_features = xen_set_cpu_features, - .pin_vcpu = xen_pin_vcpu, -}; -EXPORT_SYMBOL(x86_hyper_xen); - #ifdef CONFIG_HOTPLUG_CPU void xen_arch_register_cpu(int num) { diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c new file mode 100644 index 000000000000..a6d014f47e52 --- /dev/null +++ b/arch/x86/xen/enlighten_hvm.c @@ -0,0 +1,214 @@ +#include <linux/cpu.h> +#include <linux/kexec.h> + +#include <xen/features.h> +#include <xen/events.h> +#include <xen/interface/memory.h> + +#include <asm/cpu.h> +#include <asm/smp.h> +#include <asm/reboot.h> +#include <asm/setup.h> +#include <asm/hypervisor.h> + +#include <asm/xen/cpuid.h> +#include <asm/xen/hypervisor.h> + +#include "xen-ops.h" +#include "mmu.h" +#include "smp.h" + +void __ref xen_hvm_init_shared_info(void) +{ + int cpu; + struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page; + + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. */ + for_each_online_cpu(cpu) { + /* Leave it to be NULL. */ + if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) + continue; + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + } +} + +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, base; + + base = xen_cpuid_base(); + eax = cpuid_eax(base + 1); + + major = eax >> 16; + minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + xen_domain_type = XEN_HVM_DOMAIN; + + /* PVH set up hypercall page in xen_prepare_pvh(). 
*/ + if (xen_pvh_domain()) + pv_info.name = "Xen PVH"; + else { + u64 pfn; + uint32_t msr; + + pv_info.name = "Xen HVM"; + msr = cpuid_ebx(base + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + } + + xen_setup_features(); + + cpuid(base + 4, &eax, &ebx, &ecx, &edx); + if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) + this_cpu_write(xen_vcpu_id, ebx); + else + this_cpu_write(xen_vcpu_id, smp_processor_id()); +} + +#ifdef CONFIG_KEXEC_CORE +static void xen_hvm_shutdown(void) +{ + native_machine_shutdown(); + if (kexec_in_progress) + xen_reboot(SHUTDOWN_soft_reset); +} + +static void xen_hvm_crash_shutdown(struct pt_regs *regs) +{ + native_machine_crash_shutdown(regs); + xen_reboot(SHUTDOWN_soft_reset); +} +#endif + +static int xen_cpu_up_prepare_hvm(unsigned int cpu) +{ + int rc; + + /* + * This can happen if CPU was offlined earlier and + * offlining timed out in common_cpu_die(). + */ + if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + } + + if (cpu_acpi_id(cpu) != U32_MAX) + per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); + else + per_cpu(xen_vcpu_id, cpu) = cpu; + xen_vcpu_setup(cpu); + + if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_timer(cpu); + + rc = xen_smp_intr_init(cpu); + if (rc) { + WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", + cpu, rc); + return rc; + } + return 0; +} + +static int xen_cpu_dead_hvm(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + + if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_teardown_timer(cpu); + + return 0; +} + +static void __init xen_hvm_guest_init(void) +{ + if (xen_pv_domain()) + return; + + init_hvm_pv_info(); + + xen_hvm_init_shared_info(); + + xen_panic_handler_init(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + + xen_hvm_smp_init(); + WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); + xen_unplug_emulated_devices(); + x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); + + if (xen_pvh_domain()) + machine_ops.emergency_restart = xen_emergency_restart; +#ifdef CONFIG_KEXEC_CORE + machine_ops.shutdown = xen_hvm_shutdown; + machine_ops.crash_shutdown = xen_hvm_crash_shutdown; +#endif +} + +static bool xen_nopv; +static __init int xen_parse_nopv(char *arg) +{ + xen_nopv = true; + return 0; +} +early_param("xen_nopv", xen_parse_nopv); + +bool xen_hvm_need_lapic(void) +{ + if (xen_nopv) + return false; + if (xen_pv_domain()) + return false; + if (!xen_hvm_domain()) + return false; + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) + return false; + return true; +} +EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); + +static uint32_t __init xen_platform_hvm(void) +{ + if (xen_pv_domain() || xen_nopv) + return 0; + + return xen_cpuid_base(); +} + +const struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_platform_hvm, + .init_platform = xen_hvm_guest_init, + .pin_vcpu = xen_pin_vcpu, + .x2apic_available = xen_x2apic_para_available, +}; +EXPORT_SYMBOL(x86_hyper_xen_hvm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c new file mode 100644 index 000000000000..a732bc2b9dfc --- /dev/null +++ b/arch/x86/xen/enlighten_pv.c @@ -0,0 +1,1513 @@ +/* + * Core of Xen paravirt_ops implementation. 
+ * + * This file contains the xen_paravirt_ops structure itself, and the + * implementations for: + * - privileged instructions + * - interrupt flags + * - segment operations + * - booting and setup + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/preempt.h> +#include <linux/hardirq.h> +#include <linux/percpu.h> +#include <linux/delay.h> +#include <linux/start_kernel.h> +#include <linux/sched.h> +#include <linux/kprobes.h> +#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/highmem.h> +#include <linux/console.h> +#include <linux/pci.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <linux/edd.h> +#include <linux/frame.h> + +#include <xen/xen.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/version.h> +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/memory.h> +#include <xen/interface/nmi.h> +#include <xen/interface/xen-mca.h> +#include <xen/features.h> +#include <xen/page.h> +#include <xen/hvc-console.h> +#include <xen/acpi.h> + +#include <asm/paravirt.h> +#include <asm/apic.h> +#include <asm/page.h> +#include <asm/xen/pci.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/cpuid.h> +#include <asm/fixmap.h> +#include <asm/processor.h> +#include <asm/proto.h> +#include <asm/msr-index.h> +#include <asm/traps.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/reboot.h> +#include <asm/stackprotector.h> +#include <asm/hypervisor.h> +#include <asm/mach_traps.h> +#include <asm/mwait.h> +#include <asm/pci_x86.h> +#include <asm/cpu.h> + +#ifdef CONFIG_ACPI +#include <linux/acpi.h> +#include <asm/acpi.h> +#include <acpi/pdc_intel.h> +#include <acpi/processor.h> +#include <xen/interface/platform.h> +#endif + +#include "xen-ops.h" +#include "mmu.h" +#include "smp.h" +#include "multicalls.h" +#include "pmu.h" + +void *xen_initial_gdt; + +RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); + +static int xen_cpu_up_prepare_pv(unsigned int cpu); +static int xen_cpu_dead_pv(unsigned int cpu); + +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. + */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... 
+ */ +void xen_vcpu_restore(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), + NULL); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) + BUG(); + + xen_setup_runstate_info(cpu); + + if (xen_have_vcpu_info_placement) + xen_vcpu_setup(cpu); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) + BUG(); + } +} + +static void __init xen_banner(void) +{ + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + + pr_info("Booting paravirtualized kernel %son %s\n", + xen_feature(XENFEAT_auto_translated_physmap) ? + "with PVH extensions " : "", pv_info.name); + printk(KERN_INFO "Xen version: %d.%d%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); +} +/* Check if running on Xen version (major, minor) or later */ +bool +xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} + +static __read_mostly unsigned int cpuid_leaf5_ecx_val; +static __read_mostly unsigned int cpuid_leaf5_edx_val; + +static void xen_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) +{ + unsigned maskebx = ~0; + + /* + * Mask out inconvenient features, to try and disable as many + * unsupported kernel subsystems as possible. + */ + switch (*ax) { + case CPUID_MWAIT_LEAF: + /* Synthesize the values.. */ + *ax = 0; + *bx = 0; + *cx = cpuid_leaf5_ecx_val; + *dx = cpuid_leaf5_edx_val; + return; + + case 0xb: + /* Suppress extended topology stuff */ + maskebx = 0; + break; + } + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx)); + + *bx &= maskebx; +} +STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ + +static bool __init xen_check_mwait(void) +{ +#ifdef CONFIG_ACPI + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + uint32_t buf[3]; + unsigned int ax, bx, cx, dx; + unsigned int mwait_mask; + + /* We need to determine whether it is OK to expose the MWAIT + * capability to the kernel to harvest deeper than C3 states from ACPI + * _CST using the processor_harvest_xen.c module. For this to work, we + * need to gather the MWAIT_LEAF values (which the cstate.c code + * checks against). The hypervisor won't expose the MWAIT flag because + * it would break backwards compatibility; so we will find out directly + * from the hardware and hypercall. + */ + if (!xen_initial_domain()) + return false; + + /* + * When running under platform earlier than Xen4.2, do not expose + * mwait, to avoid the risk of loading native acpi pad driver + */ + if (!xen_running_on_version_or_later(4, 2)) + return false; + + ax = 1; + cx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + mwait_mask = (1 << (X86_FEATURE_EST % 32)) | + (1 << (X86_FEATURE_MWAIT % 32)); + + if ((cx & mwait_mask) != mwait_mask) + return false; + + /* We need to emulate the MWAIT_LEAF and for that we need both + * ecx and edx. 
The hypercall provides only partial information. + */ + + ax = CPUID_MWAIT_LEAF; + bx = 0; + cx = 0; + dx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. + */ + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + + if ((HYPERVISOR_platform_op(&op) == 0) && + (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + cpuid_leaf5_ecx_val = cx; + cpuid_leaf5_edx_val = dx; + } + return true; +#else + return false; +#endif +} + +static bool __init xen_check_xsave(void) +{ + unsigned int err, eax, edx; + + /* + * Xen 4.0 and older accidentally leaked the host XSAVE flag into guest + * view, despite not being able to support guests using the + * functionality. Probe for the actual availability of XSAVE by seeing + * whether xgetbv executes successfully or raises #UD. + */ + asm volatile("1: .byte 0x0f,0x01,0xd0\n\t" /* xgetbv */ + "xor %[err], %[err]\n" + "2:\n\t" + ".pushsection .fixup,\"ax\"\n\t" + "3: movl $1,%[err]\n\t" + "jmp 2b\n\t" + ".popsection\n\t" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), "=a" (eax), "=d" (edx) + : "c" (0)); + + return err == 0; +} + +static void __init xen_init_capabilities(void) +{ + setup_clear_cpu_cap(X86_BUG_SYSRET_SS_ATTRS); + setup_force_cpu_cap(X86_FEATURE_XENPV); + setup_clear_cpu_cap(X86_FEATURE_DCA); + setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); + setup_clear_cpu_cap(X86_FEATURE_MTRR); + setup_clear_cpu_cap(X86_FEATURE_ACC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + + if (!xen_initial_domain()) + setup_clear_cpu_cap(X86_FEATURE_ACPI); + + if (xen_check_mwait()) + setup_force_cpu_cap(X86_FEATURE_MWAIT); + else + setup_clear_cpu_cap(X86_FEATURE_MWAIT); + + if (xen_check_xsave()) { + setup_force_cpu_cap(X86_FEATURE_XSAVE); + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); + } else { + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_OSXSAVE); + } +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static void xen_end_context_switch(struct task_struct *next) +{ + xen_mc_flush(); + paravirt_end_context_switch(next); +} + +static unsigned long xen_store_tr(void) +{ + return 0; +} + +/* + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + unsigned char dummy; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + /* + * Careful: update_va_mapping() will fail if the virtual address + * we're poking isn't populated in the page tables. We don't + * need to worry about the direct map (that's always in the page + * tables), but we need to be careful about vmap space. In + * particular, the top level page table can lazily propagate + * entries between processes, so if we've switched mms since we + * vmapped the target in the first place, we might not have the + * top-level page table entry populated. 
+ * + * We disable preemption because we want the same mm active when + * we probe the target and when we issue the hypercall. We'll + * have the same nominal mm, but if we're a kernel thread, lazy + * mm dropping could change our pgd. + * + * Out of an abundance of caution, this uses __get_user() to fault + * in the target address just in case there's some obscure case + * in which the target address isn't readable. + */ + + preempt_disable(); + + probe_kernel_read(&dummy, v, 1); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); + + preempt_enable(); +} + +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + /* + * We need to mark the all aliases of the LDT pages RO. We + * don't need to call vm_flush_aliases(), though, since that's + * only responsible for flushing aliases out the TLBs, not the + * page tables, and Xen will flush the TLB for us if needed. + * + * To avoid confusing future readers: none of this is necessary + * to load the LDT. The hypervisor only checks this when the + * LDT is faulted in due to subsequent descriptor access. + */ + + for (i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + for (i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); +} + +static void xen_set_ldt(const void *addr, unsigned entries) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + trace_xen_cpu_set_ldt(addr, entries); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)addr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_load_gdt(const struct desc_ptr *dtr) +{ + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long frames[pages]; + int f; + + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + int level; + pte_t *ptep; + unsigned long pfn, mfn; + void *virt; + + /* + * The GDT is per-cpu and is in the percpu data area. + * That can be virtually mapped, so we need to do a + * page-walk to get the underlying MFN for the + * hypercall. The page can also be in the kernel's + * linear range, so we need to RO that mapping too. 
+ */ + ptep = lookup_address(va, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + mfn = pfn_to_mfn(pfn); + virt = __va(PFN_PHYS(pfn)); + + frames[f] = mfn; + + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(virt); + } + + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} + +/* + * load_gdt for early boot, when the gdt is only mapped once + */ +static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) +{ + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long frames[pages]; + int f; + + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + pte_t pte; + unsigned long pfn, mfn; + + pfn = virt_to_pfn(va); + mfn = pfn_to_mfn(pfn); + + pte = pfn_pte(pfn, PAGE_KERNEL_RO); + + if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) + BUG(); + + frames[f] = mfn; + } + + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} + +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_rw(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + /* + * XXX sleazy hack: If we're being called in a lazy-cpu zone + * and lazy gs handling is enabled, it means we're in a + * context switch, and %gs has just been saved. This means we + * can zero it out to prevent faults on exit from the + * hypervisor if the next process has no %gs. Either way, it + * has been saved, and the new value will get loaded properly. + * This will go away as soon as Xen has been modified to not + * save/restore %gs for normal hypercalls. + * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we + * must not zero %gs on x86_64 + * + * For x86_64, we need to zero %fs, otherwise we may get an + * exception between the new %fs descriptor being loaded and + * %fs being effectively cleared at __switch_to(). 
+ */ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { +#ifdef CONFIG_X86_32 + lazy_load_gs(0); +#else + loadsegment(fs, 0); +#endif + } + + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +#ifdef CONFIG_X86_64 +static void xen_load_gs_index(unsigned int idx) +{ + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) + BUG(); +} +#endif + +static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, + const void *ptr) +{ + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); + u64 entry = *(u64 *)ptr; + + trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); + + preempt_disable(); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); + + preempt_enable(); +} + +static int cvt_gate_to_trap(int vector, const gate_desc *val, + struct trap_info *info) +{ + unsigned long addr; + + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) + return 0; + + info->vector = vector; + + addr = gate_offset(*val); +#ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them + * appropriately. The debugger ones are the only ones we care + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if + * there's an unexpected IST-using fault handler. + */ + if (addr == (unsigned long)debug) + addr = (unsigned long)xen_debug; + else if (addr == (unsigned long)int3) + addr = (unsigned long)xen_int3; + else if (addr == (unsigned long)stack_segment) + addr = (unsigned long)xen_stack_segment; + else if (addr == (unsigned long)double_fault) { + /* Don't need to handle these */ + return 0; +#ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { + /* + * when xen hypervisor inject vMCE to guest, + * use native mce handler to handle it + */ + ; +#endif + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { + /* Some other trap using IST? */ + if (WARN_ON(val->ist != 0)) + return 0; + } +#endif /* CONFIG_X86_64 */ + info->address = addr; + + info->cs = gate_segment(*val); + info->flags = val->dpl; + /* interrupt gates clear IF */ + if (val->type == GATE_INTERRUPT) + info->flags |= 1 << 2; + + return 1; +} + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct desc_ptr, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. 
*/ +static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) +{ + unsigned long p = (unsigned long)&dt[entrynum]; + unsigned long start, end; + + trace_xen_cpu_write_idt_entry(dt, entrynum, g); + + preempt_disable(); + + start = __this_cpu_read(idt_desc.address); + end = start + __this_cpu_read(idt_desc.size) + 1; + + xen_mc_flush(); + + native_write_idt_entry(dt, entrynum, g); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, g, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } + + preempt_enable(); +} + +static void xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps) +{ + unsigned in, out, count; + + count = (desc->size+1) / sizeof(gate_desc); + BUG_ON(count > 256); + + for (in = out = 0; in < count; in++) { + gate_desc *entry = (gate_desc *)(desc->address) + in; + + if (cvt_gate_to_trap(in, entry, &traps[out])) + out++; + } + traps[out].address = 0; +} + +void xen_copy_trap_info(struct trap_info *traps) +{ + const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); + + xen_convert_trap_info(desc, traps); +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct desc_ptr *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + trace_xen_cpu_load_idt(desc); + + spin_lock(&lock); + + memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); + + xen_convert_trap_info(desc, traps); + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. */ +static void xen_write_gdt_entry(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + + preempt_disable(); + + switch (type) { + case DESC_LDT: + case DESC_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) + BUG(); + } + + } + + preempt_enable(); +} + +/* + * Version of write_gdt_entry for use at early boot-time needed to + * update an entry as simply as possible. + */ +static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + + switch (type) { + case DESC_LDT: + case DESC_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) + dt[entry] = *(struct desc_struct *)desc; + } + + } +} + +static void xen_load_sp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + struct multicall_space mcs; + + mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); + tss->x86_tss.sp0 = thread->sp0; +} + +void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 
1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +} + +static void xen_io_delay(void) +{ +} + +static DEFINE_PER_CPU(unsigned long, xen_cr0_value); + +static unsigned long xen_read_cr0(void) +{ + unsigned long cr0 = this_cpu_read(xen_cr0_value); + + if (unlikely(cr0 == 0)) { + cr0 = native_read_cr0(); + this_cpu_write(xen_cr0_value, cr0); + } + + return cr0; +} + +static void xen_write_cr0(unsigned long cr0) +{ + struct multicall_space mcs; + + this_cpu_write(xen_cr0_value, cr0); + + /* Only pay attention to cr0.TS; everything else is + ignored. */ + mcs = xen_mc_entry(0); + + MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_write_cr4(unsigned long cr4) +{ + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); + + native_write_cr4(cr4); +} +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ + return 0; +} +static inline void xen_write_cr8(unsigned long val) +{ + BUG_ON(val); +} +#endif + +static u64 xen_read_msr_safe(unsigned int msr, int *err) +{ + u64 val; + + if (pmu_msr_read(msr, &val, err)) + return val; + + val = native_read_msr_safe(msr, err); + switch (msr) { + case MSR_IA32_APICBASE: +#ifdef CONFIG_X86_X2APIC + if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) +#endif + val &= ~X2APIC_ENABLE; + break; + } + return val; +} + +static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +{ + int ret; + + ret = 0; + + switch (msr) { +#ifdef CONFIG_X86_64 + unsigned which; + u64 base; + + case MSR_FS_BASE: which = SEGBASE_FS; goto set; + case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; + case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; + + set: + base = ((u64)high << 32) | low; + if (HYPERVISOR_set_segment_base(which, base) != 0) + ret = -EIO; + break; +#endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + + default: + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); + } + + return ret; +} + +static u64 xen_read_msr(unsigned int msr) +{ + /* + * This will silently swallow a #GP from RDMSR. It may be worth + * changing that. + */ + int err; + + return xen_read_msr_safe(msr, &err); +} + +static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +{ + /* + * This will silently swallow a #GP from WRMSR. It may be worth + * changing that. + */ + xen_write_msr_safe(msr, low, high); +} + +void xen_setup_shared_info(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_fixmap(FIX_PARAVIRT_BOOTMAP, + xen_start_info->shared_info); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + +#ifndef CONFIG_SMP + /* In UP this is as good a place as any to set up shared info */ + xen_setup_vcpu_info_placement(); +#endif + + xen_setup_mfn_list_list(); +} + +/* This is called once we have the cpu_possible_mask */ +void xen_setup_vcpu_info_placement(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + /* Set up direct vCPU id mapping for PV guests. 
*/ + per_cpu(xen_vcpu_id, cpu) = cpu; + xen_vcpu_setup(cpu); + } + + /* + * xen_vcpu_setup managed to place the vcpu_info within the + * percpu area for all cpus, so make use of it. + */ + if (xen_have_vcpu_info_placement) { + pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); + pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); + pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); + pv_mmu_ops.read_cr2 = xen_read_cr2_direct; + } +} + +static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len) +{ + char *start, *end, *reloc; + unsigned ret; + + start = end = reloc = NULL; + +#define SITE(op, x) \ + case PARAVIRT_PATCH(op.x): \ + if (xen_have_vcpu_info_placement) { \ + start = (char *)xen_##x##_direct; \ + end = xen_##x##_direct_end; \ + reloc = xen_##x##_direct_reloc; \ + } \ + goto patch_site + + switch (type) { + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, save_fl); + SITE(pv_irq_ops, restore_fl); +#undef SITE + + patch_site: + if (start == NULL || (end-start) > len) + goto default_patch; + + ret = paravirt_patch_insns(insnbuf, len, start, end); + + /* Note: because reloc is assigned from something that + appears to be an array, gcc assumes it's non-null, + but doesn't know its relationship with start and + end. */ + if (reloc > start && reloc < end) { + int reloc_off = reloc - start; + long *relocp = (long *)(insnbuf + reloc_off); + long delta = start - (char *)addr; + + *relocp += delta; + } + break; + + default_patch: + default: + ret = paravirt_patch_default(type, clobbers, insnbuf, + addr, len); + break; + } + + return ret; +} + +static const struct pv_info xen_info __initconst = { + .shared_kernel_pmd = 0, + +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = FLAT_USER_CS64, +#endif + .name = "Xen", +}; + +static const struct pv_init_ops xen_init_ops __initconst = { + .patch = xen_patch, +}; + +static const struct pv_cpu_ops xen_cpu_ops __initconst = { + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, + + .read_cr4 = native_read_cr4, + .write_cr4 = xen_write_cr4, + +#ifdef CONFIG_X86_64 + .read_cr8 = xen_read_cr8, + .write_cr8 = xen_write_cr8, +#endif + + .wbinvd = native_wbinvd, + + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, + + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, + + .read_pmc = xen_read_pmc, + + .iret = xen_iret, +#ifdef CONFIG_X86_64 + .usergs_sysret64 = xen_sysret64, +#endif + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = xen_load_gs_index, +#endif + + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_sp0 = xen_load_sp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + + /* Xen takes care of %gs when switching to usermode for us */ + .swapgs = paravirt_nop, + + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, +}; + +static void xen_restart(char *msg) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_machine_halt(void) 
+{ + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_crash_shutdown(struct pt_regs *regs) +{ + xen_reboot(SHUTDOWN_crash); +} + +static const struct machine_ops xen_machine_ops __initconst = { + .restart = xen_restart, + .halt = xen_machine_halt, + .power_off = xen_machine_power_off, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = xen_emergency_restart, +}; + +static unsigned char xen_get_nmi_reason(void) +{ + unsigned char reason = 0; + + /* Construct a value which looks like it came from port 0x61. */ + if (test_bit(_XEN_NMIREASON_io_error, + &HYPERVISOR_shared_info->arch.nmi_reason)) + reason |= NMI_REASON_IOCHK; + if (test_bit(_XEN_NMIREASON_pci_serr, + &HYPERVISOR_shared_info->arch.nmi_reason)) + reason |= NMI_REASON_SERR; + + return reason; +} + +static void __init xen_boot_params_init_edd(void) +{ +#if IS_ENABLED(CONFIG_EDD) + struct xen_platform_op op; + struct edd_info *edd_info; + u32 *mbr_signature; + unsigned nr; + int ret; + + edd_info = boot_params.eddbuf; + mbr_signature = boot_params.edd_mbr_sig_buffer; + + op.cmd = XENPF_firmware_info; + + op.u.firmware_info.type = XEN_FW_DISK_INFO; + for (nr = 0; nr < EDDMAXNR; nr++) { + struct edd_info *info = edd_info + nr; + + op.u.firmware_info.index = nr; + info->params.length = sizeof(info->params); + set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, + &info->params); + ret = HYPERVISOR_platform_op(&op); + if (ret) + break; + +#define C(x) info->x = op.u.firmware_info.u.disk_info.x + C(device); + C(version); + C(interface_support); + C(legacy_max_cylinder); + C(legacy_max_head); + C(legacy_sectors_per_track); +#undef C + } + boot_params.eddbuf_entries = nr; + + op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; + for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { + op.u.firmware_info.index = nr; + ret = HYPERVISOR_platform_op(&op); + if (ret) + break; + mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; + } + boot_params.edd_mbr_sig_buf_entries = nr; +#endif +} + +/* + * Set up the GDT and segment registers for -fstack-protector. Until + * we do this, we have to be careful not to call any stack-protected + * function, which is most of the kernel. + */ +static void xen_setup_gdt(int cpu) +{ + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; + pv_cpu_ops.load_gdt = xen_load_gdt_boot; + + setup_stack_canary_segment(0); + switch_to_new_gdt(0); + + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; + pv_cpu_ops.load_gdt = xen_load_gdt; +} + +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + +/* First C function to be called on Xen boot */ +asmlinkage __visible void __init xen_start_kernel(void) +{ + struct physdev_set_iopl set_iopl; + unsigned long initrd_start = 0; + int rc; + + if (!xen_start_info) + return; + + xen_domain_type = XEN_PV_DOMAIN; + + xen_setup_features(); + + xen_setup_machphys_mapping(); + + /* Install Xen paravirt ops */ + pv_info = xen_info; + pv_init_ops = xen_init_ops; + pv_cpu_ops = xen_cpu_ops; + + x86_platform.get_nmi_reason = xen_get_nmi_reason; + + x86_init.resources.memory_setup = xen_memory_setup; + x86_init.oem.arch_setup = xen_arch_setup; + x86_init.oem.banner = xen_banner; + + xen_init_time_ops(); + + /* + * Set up some pagetable state before starting to set any ptes. + */ + + xen_init_mmu_ops(); + + /* Prevent unwanted bits from being set in PTEs. 
*/ + __supported_pte_mask &= ~_PAGE_GLOBAL; + + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + + /* Work out if we support NX */ + x86_configure_nx(); + + /* Get mfn list */ + xen_build_dynamic_phys_to_machine(); + + /* + * Set up kernel GDT and segment registers, mainly so that + * -fstack-protector code can be executed. + */ + xen_setup_gdt(0); + + xen_init_irq_ops(); + xen_init_capabilities(); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * set up the basic apic ops. + */ + xen_init_apic(); +#endif + + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { + pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; + pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; + } + + machine_ops = xen_machine_ops; + + /* + * The only reliable way to retain the initial address of the + * percpu gdt_page is to remember it here, so we can go and + * mark it RW later, when the initial percpu area is freed. + */ + xen_initial_gdt = &per_cpu(gdt_page, 0); + + xen_smp_init(); + +#ifdef CONFIG_ACPI_NUMA + /* + * The pages we get from Xen are not related to machine pages, so + * any NUMA information the kernel tries to get from ACPI will + * be meaningless. Prevent it from trying. + */ + acpi_numa = -1; +#endif + /* Don't do the full vcpu_info placement stuff until we have a + possible map and a non-dummy shared_info. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + + WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); + + local_irq_disable(); + early_boot_irqs_disabled = true; + + xen_raw_console_write("mapping kernel into physical memory\n"); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); + + /* keep using Xen gdt for now; no urgent need to change it */ + +#ifdef CONFIG_X86_32 + pv_info.kernel_rpl = 1; + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + pv_info.kernel_rpl = 0; +#else + pv_info.kernel_rpl = 0; +#endif + /* set the limit of our address space */ + xen_reserve_top(); + + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD where early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port.
+ */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + +#ifdef CONFIG_X86_32 + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); +#endif + + if (xen_start_info->mod_start) { + if (xen_start_info->flags & SIF_MOD_START_PFN) + initrd_start = PFN_PHYS(xen_start_info->mod_start); + else + initrd_start = __pa(xen_start_info->mod_start); + } + + /* Poke various useful things into boot_params */ + boot_params.hdr.type_of_loader = (9 << 4) | 0; + boot_params.hdr.ramdisk_image = initrd_start; + boot_params.hdr.ramdisk_size = xen_start_info->mod_len; + boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; + + if (!xen_initial_domain()) { + add_preferred_console("xenboot", 0, NULL); + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + if (pci_xen) + x86_init.pci.arch_init = pci_xen_init; + } else { + const struct dom0_vga_console_info *info = + (void *)((char *)xen_start_info + + xen_start_info->console.dom0.info_off); + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .interface_version = XENPF_INTERFACE_VERSION, + .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, + }; + + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; + xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_start_info->console.domU.mfn = 0; + xen_start_info->console.domU.evtchn = 0; + + if (HYPERVISOR_platform_op(&op) == 0) + boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; + + /* Make sure ACS will be enabled */ + pci_request_acs(); + + xen_acpi_sleep_register(); + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + + xen_boot_params_init_edd(); + } +#ifdef CONFIG_PCI + /* PCI BIOS service won't work from a PV guest. */ + pci_probe &= ~PCI_PROBE_BIOS; +#endif + xen_raw_console_write("about to get started...\n"); + + /* Let's presume PV guests always boot on vCPU with id 0. 
*/ + per_cpu(xen_vcpu_id, 0) = 0; + + xen_setup_runstate_info(0); + + xen_efi_init(); + + /* Start the world */ +#ifdef CONFIG_X86_32 + i386_start_kernel(); +#else + cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ + x86_64_start_reservations((char *)__pa_symbol(&boot_params)); +#endif +} + +static int xen_cpu_up_prepare_pv(unsigned int cpu) +{ + int rc; + + xen_setup_timer(cpu); + + rc = xen_smp_intr_init(cpu); + if (rc) { + WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", + cpu, rc); + return rc; + } + + rc = xen_smp_intr_init_pv(cpu); + if (rc) { + WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n", + cpu, rc); + return rc; + } + + return 0; +} + +static int xen_cpu_dead_pv(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + xen_smp_intr_free_pv(cpu); + + xen_teardown_timer(cpu); + + return 0; +} + +static uint32_t __init xen_platform_pv(void) +{ + if (xen_pv_domain()) + return xen_cpuid_base(); + + return 0; +} + +const struct hypervisor_x86 x86_hyper_xen_pv = { + .name = "Xen PV", + .detect = xen_platform_pv, + .pin_vcpu = xen_pin_vcpu, +}; +EXPORT_SYMBOL(x86_hyper_xen_pv); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c new file mode 100644 index 000000000000..98ab17673454 --- /dev/null +++ b/arch/x86/xen/enlighten_pvh.c @@ -0,0 +1,106 @@ +#include <linux/acpi.h> + +#include <xen/hvc-console.h> + +#include <asm/io_apic.h> +#include <asm/hypervisor.h> +#include <asm/e820/api.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypercall.h> + +#include <xen/interface/memory.h> +#include <xen/interface/hvm/start_info.h> + +/* + * PVH variables. + * + * xen_pvh and pvh_bootparams need to live in data segment since they + * are used after startup_{32|64}, which clear .bss, are invoked. + */ +bool xen_pvh __attribute__((section(".data"))) = 0; +struct boot_params pvh_bootparams __attribute__((section(".data"))); + +struct hvm_start_info pvh_start_info; +unsigned int pvh_start_info_sz = sizeof(pvh_start_info); + +static void xen_pvh_arch_setup(void) +{ + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ + if (nr_ioapics == 0) + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; +} + +static void __init init_pvh_bootparams(void) +{ + struct xen_memory_map memmap; + int rc; + + memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); + + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + pvh_bootparams.e820_entries = memmap.nr_entries; + + if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = + ISA_START_ADDRESS; + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].size = + ISA_END_ADDRESS - ISA_START_ADDRESS; + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = + E820_TYPE_RESERVED; + pvh_bootparams.e820_entries++; + } else + xen_raw_printk("Warning: Cannot fit ISA range into e820\n"); + + pvh_bootparams.hdr.cmd_line_ptr = + pvh_start_info.cmdline_paddr; + + /* The first module is always ramdisk. */ + if (pvh_start_info.nr_modules) { + struct hvm_modlist_entry *modaddr = + __va(pvh_start_info.modlist_paddr); + pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; + pvh_bootparams.hdr.ramdisk_size = modaddr->size; + } + + /* + * See Documentation/x86/boot.txt.
+ * + * Version 2.12 supports Xen entry point but we will use default x86/PC + * environment (i.e. hardware_subarch 0). + */ + pvh_bootparams.hdr.version = 0x212; + pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ +} + +/* + * This routine (and those that it might call) should not use + * anything that lives in .bss since that segment will be cleared later. + */ +void __init xen_prepare_pvh(void) +{ + u32 msr; + u64 pfn; + + if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", + pvh_start_info.magic); + BUG(); + } + + xen_pvh = 1; + + msr = cpuid_ebx(xen_cpuid_base() + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + init_pvh_bootparams(); + + x86_init.oem.arch_setup = xen_pvh_arch_setup; +} diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 37cb5aad71de..5e375a5e815f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1,84 +1,10 @@ -/* - * Xen mmu operations - * - * This file contains the various mmu fetch and update operations. - * The most important job they must perform is the mapping between the - * domain's pfn and the overall machine mfns. - * - * Xen allows guests to directly update the pagetable, in a controlled - * fashion. In other words, the guest modifies the same pagetable - * that the CPU actually uses, which eliminates the overhead of having - * a separate shadow pagetable. - * - * In order to allow this, it falls on the guest domain to map its - * notion of a "physical" pfn - which is just a domain-local linear - * address - into a real "machine address" which the CPU's MMU can - * use. - * - * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be - * inserted directly into the pagetable. When creating a new - * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, - * when reading the content back with __(pgd|pmd|pte)_val, it converts - * the mfn back into a pfn. - * - * The other constraint is that all pages which make up a pagetable - * must be mapped read-only in the guest. This prevents uncontrolled - * guest updates to the pagetable. Xen strictly enforces this, and - * will disallow any pagetable update which will end up mapping a - * pagetable page RW, and will disallow using any writable page as a - * pagetable. - * - * Naively, when loading %cr3 with the base of a new pagetable, Xen - * would need to validate the whole pagetable before going on. - * Naturally, this is quite slow. The solution is to "pin" a - * pagetable, which enforces all the constraints on the pagetable even - * when it is not actively in use. This menas that Xen can be assured - * that it is still valid when you do load it into %cr3, and doesn't - * need to revalidate it. 
- * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ -#include <linux/sched/mm.h> -#include <linux/highmem.h> -#include <linux/debugfs.h> -#include <linux/bug.h> -#include <linux/vmalloc.h> -#include <linux/export.h> -#include <linux/init.h> -#include <linux/gfp.h> -#include <linux/memblock.h> -#include <linux/seq_file.h> -#include <linux/crash_dump.h> - -#include <trace/events/xen.h> - -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/fixmap.h> -#include <asm/mmu_context.h> -#include <asm/setup.h> -#include <asm/paravirt.h> -#include <asm/e820.h> -#include <asm/linkage.h> -#include <asm/page.h> -#include <asm/init.h> -#include <asm/pat.h> -#include <asm/smp.h> - +#include <linux/pfn.h> +#include <asm/xen/page.h> #include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/xen.h> -#include <xen/page.h> -#include <xen/interface/xen.h> -#include <xen/interface/hvm/hvm_op.h> -#include <xen/interface/version.h> #include <xen/interface/memory.h> -#include <xen/hvc-console.h> #include "multicalls.h" #include "mmu.h" -#include "debugfs.h" /* * Protects atomic reservation decrease/increase against concurrent increases. @@ -86,45 +12,6 @@ */ DEFINE_SPINLOCK(xen_reservation_lock); -#ifdef CONFIG_X86_32 -/* - * Identity map, in addition to plain kernel map. This needs to be - * large enough to allocate page table pages to allocate the rest. - * Each page can map 2MB. - */ -#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) -static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); -#endif -#ifdef CONFIG_X86_64 -/* l3 pud for userspace vsyscall mapping */ -static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; -#endif /* CONFIG_X86_64 */ - -/* - * Note about cr3 (pagetable base) values: - * - * xen_cr3 contains the current logical cr3 value; it contains the - * last set cr3. This may not be the current effective cr3, because - * its update may be being lazily deferred. However, a vcpu looking - * at its own cr3 can use this value knowing that it everything will - * be self-consistent. - * - * xen_current_cr3 contains the actual vcpu cr3; it is set once the - * hypercall to set the vcpu cr3 is complete (so it may be a little - * out of date, but it will never be set early). If one vcpu is - * looking at another vcpu's cr3 value, it should use this variable. - */ -DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ - -static phys_addr_t xen_pt_base, xen_pt_size __initdata; - -/* - * Just beyond the highest usermode address. STACK_TOP_MAX has a - * redzone above it, so round it up to a PGD boundary. 
- */ -#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) - unsigned long arbitrary_virt_to_mfn(void *vaddr) { xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); @@ -155,1163 +42,6 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) } EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); -void make_lowmem_page_readonly(void *vaddr) -{ - pte_t *pte, ptev; - unsigned long address = (unsigned long)vaddr; - unsigned int level; - - pte = lookup_address(address, &level); - if (pte == NULL) - return; /* vaddr missing */ - - ptev = pte_wrprotect(*pte); - - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) - BUG(); -} - -void make_lowmem_page_readwrite(void *vaddr) -{ - pte_t *pte, ptev; - unsigned long address = (unsigned long)vaddr; - unsigned int level; - - pte = lookup_address(address, &level); - if (pte == NULL) - return; /* vaddr missing */ - - ptev = pte_mkwrite(*pte); - - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) - BUG(); -} - - -static bool xen_page_pinned(void *ptr) -{ - struct page *page = virt_to_page(ptr); - - return PagePinned(page); -} - -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) -{ - struct multicall_space mcs; - struct mmu_update *u; - - trace_xen_mmu_set_domain_pte(ptep, pteval, domid); - - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - - /* ptep might be kmapped when using 32-bit HIGHPTE */ - u->ptr = virt_to_machine(ptep).maddr; - u->val = pte_val_ma(pteval); - - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} -EXPORT_SYMBOL_GPL(xen_set_domain_pte); - -static void xen_extend_mmu_update(const struct mmu_update *update) -{ - struct multicall_space mcs; - struct mmu_update *u; - - mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - - if (mcs.mc != NULL) { - mcs.mc->args[1]++; - } else { - mcs = __xen_mc_entry(sizeof(*u)); - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); - } - - u = mcs.args; - *u = *update; -} - -static void xen_extend_mmuext_op(const struct mmuext_op *op) -{ - struct multicall_space mcs; - struct mmuext_op *u; - - mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); - - if (mcs.mc != NULL) { - mcs.mc->args[1]++; - } else { - mcs = __xen_mc_entry(sizeof(*u)); - MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); - } - - u = mcs.args; - *u = *op; -} - -static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) -{ - struct mmu_update u; - - preempt_disable(); - - xen_mc_batch(); - - /* ptr may be ioremapped for 64-bit pagetable setup */ - u.ptr = arbitrary_virt_to_machine(ptr).maddr; - u.val = pmd_val_ma(val); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_set_pmd(pmd_t *ptr, pmd_t val) -{ - trace_xen_mmu_set_pmd(ptr, val); - - /* If page is not pinned, we can just update the entry - directly */ - if (!xen_page_pinned(ptr)) { - *ptr = val; - return; - } - - xen_set_pmd_hyper(ptr, val); -} - -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. 
- */ -void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) -{ - set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); -} - -static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) -{ - struct mmu_update u; - - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) - return false; - - xen_mc_batch(); - - u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; - u.val = pte_val_ma(pteval); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - return true; -} - -static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) -{ - if (!xen_batched_set_pte(ptep, pteval)) { - /* - * Could call native_set_pte() here and trap and - * emulate the PTE write but with 32-bit guests this - * needs two traps (one for each of the two 32-bit - * words in the PTE) so do one hypercall directly - * instead. - */ - struct mmu_update u; - - u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; - u.val = pte_val_ma(pteval); - HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); - } -} - -static void xen_set_pte(pte_t *ptep, pte_t pteval) -{ - trace_xen_mmu_set_pte(ptep, pteval); - __xen_set_pte(ptep, pteval); -} - -static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); - __xen_set_pte(ptep, pteval); -} - -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - /* Just return the pte as-is. We preserve the bits on commit */ - trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); - return *ptep; -} - -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - struct mmu_update u; - - trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); - xen_mc_batch(); - - u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; - u.val = pte_val_ma(pte); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -/* Assume pteval_t is equivalent to all the other *val_t types. */ -static pteval_t pte_mfn_to_pfn(pteval_t val) -{ - if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - unsigned long pfn = mfn_to_pfn(mfn); - - pteval_t flags = val & PTE_FLAGS_MASK; - if (unlikely(pfn == ~0)) - val = flags & ~_PAGE_PRESENT; - else - val = ((pteval_t)pfn << PAGE_SHIFT) | flags; - } - - return val; -} - -static pteval_t pte_pfn_to_mfn(pteval_t val) -{ - if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & PTE_FLAGS_MASK; - unsigned long mfn; - - if (!xen_feature(XENFEAT_auto_translated_physmap)) - mfn = __pfn_to_mfn(pfn); - else - mfn = pfn; - /* - * If there's no mfn for the pfn, then just create an - * empty non-present pte. Unfortunately this loses - * information about the original pfn, so - * pte_mfn_to_pfn is asymmetric. 
- */ - if (unlikely(mfn == INVALID_P2M_ENTRY)) { - mfn = 0; - flags = 0; - } else - mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); - val = ((pteval_t)mfn << PAGE_SHIFT) | flags; - } - - return val; -} - -__visible pteval_t xen_pte_val(pte_t pte) -{ - pteval_t pteval = pte.pte; - - return pte_mfn_to_pfn(pteval); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); - -__visible pgdval_t xen_pgd_val(pgd_t pgd) -{ - return pte_mfn_to_pfn(pgd.pgd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); - -__visible pte_t xen_make_pte(pteval_t pte) -{ - pte = pte_pfn_to_mfn(pte); - - return native_make_pte(pte); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); - -__visible pgd_t xen_make_pgd(pgdval_t pgd) -{ - pgd = pte_pfn_to_mfn(pgd); - return native_make_pgd(pgd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); - -__visible pmdval_t xen_pmd_val(pmd_t pmd) -{ - return pte_mfn_to_pfn(pmd.pmd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); - -static void xen_set_pud_hyper(pud_t *ptr, pud_t val) -{ - struct mmu_update u; - - preempt_disable(); - - xen_mc_batch(); - - /* ptr may be ioremapped for 64-bit pagetable setup */ - u.ptr = arbitrary_virt_to_machine(ptr).maddr; - u.val = pud_val_ma(val); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_set_pud(pud_t *ptr, pud_t val) -{ - trace_xen_mmu_set_pud(ptr, val); - - /* If page is not pinned, we can just update the entry - directly */ - if (!xen_page_pinned(ptr)) { - *ptr = val; - return; - } - - xen_set_pud_hyper(ptr, val); -} - -#ifdef CONFIG_X86_PAE -static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - trace_xen_mmu_set_pte_atomic(ptep, pte); - set_64bit((u64 *)ptep, native_pte_val(pte)); -} - -static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - trace_xen_mmu_pte_clear(mm, addr, ptep); - if (!xen_batched_set_pte(ptep, native_make_pte(0))) - native_pte_clear(mm, addr, ptep); -} - -static void xen_pmd_clear(pmd_t *pmdp) -{ - trace_xen_mmu_pmd_clear(pmdp); - set_pmd(pmdp, __pmd(0)); -} -#endif /* CONFIG_X86_PAE */ - -__visible pmd_t xen_make_pmd(pmdval_t pmd) -{ - pmd = pte_pfn_to_mfn(pmd); - return native_make_pmd(pmd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); - -#if CONFIG_PGTABLE_LEVELS == 4 -__visible pudval_t xen_pud_val(pud_t pud) -{ - return pte_mfn_to_pfn(pud.pud); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); - -__visible pud_t xen_make_pud(pudval_t pud) -{ - pud = pte_pfn_to_mfn(pud); - - return native_make_pud(pud); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); - -static pgd_t *xen_get_user_pgd(pgd_t *pgd) -{ - pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); - unsigned offset = pgd - pgd_page; - pgd_t *user_ptr = NULL; - - if (offset < pgd_index(USER_LIMIT)) { - struct page *page = virt_to_page(pgd_page); - user_ptr = (pgd_t *)page->private; - if (user_ptr) - user_ptr += offset; - } - - return user_ptr; -} - -static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) -{ - struct mmu_update u; - - u.ptr = virt_to_machine(ptr).maddr; - u.val = pgd_val_ma(val); - xen_extend_mmu_update(&u); -} - -/* - * Raw hypercall-based set_pgd, intended for in early boot before - * there's a page structure. This implies: - * 1. The only existing pagetable is the kernel's - * 2. It is always pinned - * 3. 
It has no user pagetable attached to it - */ -static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) -{ - preempt_disable(); - - xen_mc_batch(); - - __xen_set_pgd_hyper(ptr, val); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_set_pgd(pgd_t *ptr, pgd_t val) -{ - pgd_t *user_ptr = xen_get_user_pgd(ptr); - - trace_xen_mmu_set_pgd(ptr, user_ptr, val); - - /* If page is not pinned, we can just update the entry - directly */ - if (!xen_page_pinned(ptr)) { - *ptr = val; - if (user_ptr) { - WARN_ON(xen_page_pinned(user_ptr)); - *user_ptr = val; - } - return; - } - - /* If it's pinned, then we can at least batch the kernel and - user updates together. */ - xen_mc_batch(); - - __xen_set_pgd_hyper(ptr, val); - if (user_ptr) - __xen_set_pgd_hyper(user_ptr, val); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ - -/* - * (Yet another) pagetable walker. This one is intended for pinning a - * pagetable. This means that it walks a pagetable and calls the - * callback function on each page it finds making up the page table, - * at every level. It walks the entire pagetable, but it only bothers - * pinning pte pages which are below limit. In the normal case this - * will be STACK_TOP_MAX, but at boot we need to pin up to - * FIXADDR_TOP. - * - * For 32-bit the important bit is that we don't pin beyond there, - * because then we start getting into Xen's ptes. - * - * For 64-bit, we must skip the Xen hole in the middle of the address - * space, just after the big x86-64 virtual hole. - */ -static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, - int (*func)(struct mm_struct *mm, struct page *, - enum pt_level), - unsigned long limit) -{ - int flush = 0; - unsigned hole_low, hole_high; - unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; - unsigned pgdidx, pudidx, pmdidx; - - /* The limit is the last byte to be touched */ - limit--; - BUG_ON(limit >= FIXADDR_TOP); - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - /* - * 64-bit has a great big hole in the middle of the address - * space, which contains the Xen mappings. On 32-bit these - * will end up making a zero-sized hole and so is a no-op. - */ - hole_low = pgd_index(USER_LIMIT); - hole_high = pgd_index(PAGE_OFFSET); - - pgdidx_limit = pgd_index(limit); -#if PTRS_PER_PUD > 1 - pudidx_limit = pud_index(limit); -#else - pudidx_limit = 0; -#endif -#if PTRS_PER_PMD > 1 - pmdidx_limit = pmd_index(limit); -#else - pmdidx_limit = 0; -#endif - - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { - pud_t *pud; - - if (pgdidx >= hole_low && pgdidx < hole_high) - continue; - - if (!pgd_val(pgd[pgdidx])) - continue; - - pud = pud_offset(&pgd[pgdidx], 0); - - if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - - for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { - pmd_t *pmd; - - if (pgdidx == pgdidx_limit && - pudidx > pudidx_limit) - goto out; - - if (pud_none(pud[pudidx])) - continue; - - pmd = pmd_offset(&pud[pudidx], 0); - - if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); - - for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { - struct page *pte; - - if (pgdidx == pgdidx_limit && - pudidx == pudidx_limit && - pmdidx > pmdidx_limit) - goto out; - - if (pmd_none(pmd[pmdidx])) - continue; - - pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(mm, pte, PT_PTE); - } - } - } - -out: - /* Do the top level last, so that the callbacks can use it as - a cue to do final things like tlb flushes. 
*/ - flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); - - return flush; -} - -static int xen_pgd_walk(struct mm_struct *mm, - int (*func)(struct mm_struct *mm, struct page *, - enum pt_level), - unsigned long limit) -{ - return __xen_pgd_walk(mm, mm->pgd, func, limit); -} - -/* If we're using split pte locks, then take the page's lock and - return a pointer to it. Otherwise return NULL. */ -static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) -{ - spinlock_t *ptl = NULL; - -#if USE_SPLIT_PTE_PTLOCKS - ptl = ptlock_ptr(page); - spin_lock_nest_lock(ptl, &mm->page_table_lock); -#endif - - return ptl; -} - -static void xen_pte_unlock(void *v) -{ - spinlock_t *ptl = v; - spin_unlock(ptl); -} - -static void xen_do_pin(unsigned level, unsigned long pfn) -{ - struct mmuext_op op; - - op.cmd = level; - op.arg1.mfn = pfn_to_mfn(pfn); - - xen_extend_mmuext_op(&op); -} - -static int xen_pin_page(struct mm_struct *mm, struct page *page, - enum pt_level level) -{ - unsigned pgfl = TestSetPagePinned(page); - int flush; - - if (pgfl) - flush = 0; /* already pinned */ - else if (PageHighMem(page)) - /* kmaps need flushing if we found an unpinned - highpage */ - flush = 1; - else { - void *pt = lowmem_page_address(page); - unsigned long pfn = page_to_pfn(page); - struct multicall_space mcs = __xen_mc_entry(0); - spinlock_t *ptl; - - flush = 0; - - /* - * We need to hold the pagetable lock between the time - * we make the pagetable RO and when we actually pin - * it. If we don't, then other users may come in and - * attempt to update the pagetable by writing it, - * which will fail because the memory is RO but not - * pinned, so Xen won't do the trap'n'emulate. - * - * If we're using split pte locks, we can't hold the - * entire pagetable's worth of locks during the - * traverse, because we may wrap the preempt count (8 - * bits). The solution is to mark RO and pin each PTE - * page while holding the lock. This means the number - * of locks we end up holding is never more than a - * batch size (~32 entries, at present). - * - * If we're not using split pte locks, we needn't pin - * the PTE pages independently, because we're - * protected by the overall pagetable lock. - */ - ptl = NULL; - if (level == PT_PTE) - ptl = xen_pte_lock(page, mm); - - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, - pfn_pte(pfn, PAGE_KERNEL_RO), - level == PT_PGD ? UVMF_TLB_FLUSH : 0); - - if (ptl) { - xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - - /* Queue a deferred unlock for when this batch - is completed. */ - xen_mc_callback(xen_pte_unlock, ptl); - } - } - - return flush; -} - -/* This is called just after a mm has been created, but it has not - been used yet. We need to make sure that its pagetable is all - read-only, and can be pinned. 
*/ -static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) -{ - trace_xen_mmu_pgd_pin(mm, pgd); - - xen_mc_batch(); - - if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { - /* re-enable interrupts for flushing */ - xen_mc_issue(0); - - kmap_flush_unused(); - - xen_mc_batch(); - } - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); - - if (user_pgd) { - xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa(user_pgd))); - } - } -#else /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_PAE - /* Need to make sure unshared kernel PMD is pinnable */ - xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), - PT_PMD); -#endif - xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); -#endif /* CONFIG_X86_64 */ - xen_mc_issue(0); -} - -static void xen_pgd_pin(struct mm_struct *mm) -{ - __xen_pgd_pin(mm, mm->pgd); -} - -/* - * On save, we need to pin all pagetables to make sure they get their - * mfns turned into pfns. Search the list for any unpinned pgds and pin - * them (unpinned pgds are not currently in use, probably because the - * process is under construction or destruction). - * - * Expected to be called in stop_machine() ("equivalent to taking - * every spinlock in the system"), so the locking doesn't really - * matter all that much. - */ -void xen_mm_pin_all(void) -{ - struct page *page; - - spin_lock(&pgd_lock); - - list_for_each_entry(page, &pgd_list, lru) { - if (!PagePinned(page)) { - __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); - SetPageSavePinned(page); - } - } - - spin_unlock(&pgd_lock); -} - -/* - * The init_mm pagetable is really pinned as soon as its created, but - * that's before we have page structures to store the bits. So do all - * the book-keeping now. - */ -static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, - enum pt_level level) -{ - SetPagePinned(page); - return 0; -} - -static void __init xen_mark_init_mm_pinned(void) -{ - xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); -} - -static int xen_unpin_page(struct mm_struct *mm, struct page *page, - enum pt_level level) -{ - unsigned pgfl = TestClearPagePinned(page); - - if (pgfl && !PageHighMem(page)) { - void *pt = lowmem_page_address(page); - unsigned long pfn = page_to_pfn(page); - spinlock_t *ptl = NULL; - struct multicall_space mcs; - - /* - * Do the converse to pin_page. If we're using split - * pte locks, we must be holding the lock for while - * the pte page is unpinned but still RO to prevent - * concurrent updates from seeing it in this - * partially-pinned state. - */ - if (level == PT_PTE) { - ptl = xen_pte_lock(page, mm); - - if (ptl) - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); - } - - mcs = __xen_mc_entry(0); - - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, - pfn_pte(pfn, PAGE_KERNEL), - level == PT_PGD ? 
UVMF_TLB_FLUSH : 0); - - if (ptl) { - /* unlock when batch completed */ - xen_mc_callback(xen_pte_unlock, ptl); - } - } - - return 0; /* never need to flush on unpin */ -} - -/* Release a pagetables pages back as normal RW */ -static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) -{ - trace_xen_mmu_pgd_unpin(mm, pgd); - - xen_mc_batch(); - - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(user_pgd))); - xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); - } - } -#endif - -#ifdef CONFIG_X86_PAE - /* Need to make sure unshared kernel PMD is unpinned */ - xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), - PT_PMD); -#endif - - __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); - - xen_mc_issue(0); -} - -static void xen_pgd_unpin(struct mm_struct *mm) -{ - __xen_pgd_unpin(mm, mm->pgd); -} - -/* - * On resume, undo any pinning done at save, so that the rest of the - * kernel doesn't see any unexpected pinned pagetables. - */ -void xen_mm_unpin_all(void) -{ - struct page *page; - - spin_lock(&pgd_lock); - - list_for_each_entry(page, &pgd_list, lru) { - if (PageSavePinned(page)) { - BUG_ON(!PagePinned(page)); - __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); - ClearPageSavePinned(page); - } - } - - spin_unlock(&pgd_lock); -} - -static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - spin_lock(&next->page_table_lock); - xen_pgd_pin(next); - spin_unlock(&next->page_table_lock); -} - -static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -{ - spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm); - spin_unlock(&mm->page_table_lock); -} - - -#ifdef CONFIG_SMP -/* Another cpu may still have their %cr3 pointing at the pagetable, so - we need to repoint it somewhere else before we can unpin it. */ -static void drop_other_mm_ref(void *info) -{ - struct mm_struct *mm = info; - struct mm_struct *active_mm; - - active_mm = this_cpu_read(cpu_tlbstate.active_mm); - - if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) - leave_mm(smp_processor_id()); - - /* If this cpu still has a stale cr3 reference, then make sure - it has been flushed. */ - if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) - load_cr3(swapper_pg_dir); -} - -static void xen_drop_mm_ref(struct mm_struct *mm) -{ - cpumask_var_t mask; - unsigned cpu; - - if (current->active_mm == mm) { - if (current->mm == mm) - load_cr3(swapper_pg_dir); - else - leave_mm(smp_processor_id()); - } - - /* Get the "official" set of cpus referring to our pagetable. */ - if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) - continue; - smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); - } - return; - } - cpumask_copy(mask, mm_cpumask(mm)); - - /* It's possible that a vcpu may have a stale reference to our - cr3, because its in lazy mode, and it hasn't yet flushed - its set of pending hypercalls yet. In this case, we can - look at its actual current cr3 value, and force it to flush - if needed. 
*/ - for_each_online_cpu(cpu) { - if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpumask_set_cpu(cpu, mask); - } - - if (!cpumask_empty(mask)) - smp_call_function_many(mask, drop_other_mm_ref, mm, 1); - free_cpumask_var(mask); -} -#else -static void xen_drop_mm_ref(struct mm_struct *mm) -{ - if (current->active_mm == mm) - load_cr3(swapper_pg_dir); -} -#endif - -/* - * While a process runs, Xen pins its pagetables, which means that the - * hypervisor forces it to be read-only, and it controls all updates - * to it. This means that all pagetable updates have to go via the - * hypervisor, which is moderately expensive. - * - * Since we're pulling the pagetable down, we switch to use init_mm, - * unpin old process pagetable and mark it all read-write, which - * allows further operations on it to be simple memory accesses. - * - * The only subtle point is that another CPU may be still using the - * pagetable because of lazy tlb flushing. This means we need need to - * switch all CPUs off this pagetable before we can unpin it. - */ -static void xen_exit_mmap(struct mm_struct *mm) -{ - get_cpu(); /* make sure we don't move around */ - xen_drop_mm_ref(mm); - put_cpu(); - - spin_lock(&mm->page_table_lock); - - /* pgd may not be pinned in the error exit path of execve */ - if (xen_page_pinned(mm->pgd)) - xen_pgd_unpin(mm); - - spin_unlock(&mm->page_table_lock); -} - -static void xen_post_allocator_init(void); - -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - -#ifdef CONFIG_X86_64 -static void __init xen_cleanhighmap(unsigned long vaddr, - unsigned long vaddr_end) -{ - unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; - pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); - - /* NOTE: The loop is more greedy than the cleanup_highmap variant. - * We include the PMD passed in on _both_ boundaries. */ - for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); - pmd++, vaddr += PMD_SIZE) { - if (pmd_none(*pmd)) - continue; - if (vaddr < (unsigned long) _text || vaddr > kernel_end) - set_pmd(pmd, __pmd(0)); - } - /* In case we did something silly, we should crash in this function - * instead of somewhere later and be confusing. */ - xen_mc_flush(); -} - -/* - * Make a page range writeable and free it. - */ -static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) -{ - void *vaddr = __va(paddr); - void *vaddr_end = vaddr + size; - - for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) - make_lowmem_page_readwrite(vaddr); - - memblock_free(paddr, size); -} - -static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) -{ - unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; - - if (unpin) - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); - ClearPagePinned(virt_to_page(__va(pa))); - xen_free_ro_pages(pa, PAGE_SIZE); -} - -/* - * Since it is well isolated we can (and since it is perhaps large we should) - * also free the page tables mapping the initial P->M table. 
- */ -static void __init xen_cleanmfnmap(unsigned long vaddr) -{ - unsigned long va = vaddr & PMD_MASK; - unsigned long pa; - pgd_t *pgd = pgd_offset_k(va); - pud_t *pud_page = pud_offset(pgd, 0); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned int i; - bool unpin; - - unpin = (vaddr == 2 * PGDIR_SIZE); - set_pgd(pgd, __pgd(0)); - do { - pud = pud_page + pud_index(va); - if (pud_none(*pud)) { - va += PUD_SIZE; - } else if (pud_large(*pud)) { - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PUD_SIZE); - va += PUD_SIZE; - } else { - pmd = pmd_offset(pud, va); - if (pmd_large(*pmd)) { - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PMD_SIZE); - } else if (!pmd_none(*pmd)) { - pte = pte_offset_kernel(pmd, va); - set_pmd(pmd, __pmd(0)); - for (i = 0; i < PTRS_PER_PTE; ++i) { - if (pte_none(pte[i])) - break; - pa = pte_pfn(pte[i]) << PAGE_SHIFT; - xen_free_ro_pages(pa, PAGE_SIZE); - } - xen_cleanmfnmap_free_pgtbl(pte, unpin); - } - va += PMD_SIZE; - if (pmd_index(va)) - continue; - set_pud(pud, __pud(0)); - xen_cleanmfnmap_free_pgtbl(pmd, unpin); - } - - } while (pud_index(va) || pmd_index(va)); - xen_cleanmfnmap_free_pgtbl(pud_page, unpin); -} - -static void __init xen_pagetable_p2m_free(void) -{ - unsigned long size; - unsigned long addr; - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - - /* No memory or already called. */ - if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) - return; - - /* using __ka address and sticking INVALID_P2M_ENTRY! */ - memset((void *)xen_start_info->mfn_list, 0xff, size); - - addr = xen_start_info->mfn_list; - /* - * We could be in __ka space. - * We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or - * xen_start_info->shared_info they are in going to crash. Fortunatly - * we have already revectored in xen_setup_kernel_pagetable and in - * xen_setup_shared_info. - */ - size = roundup(size, PMD_SIZE); - - if (addr >= __START_KERNEL_map) { - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * - sizeof(unsigned long)); - memblock_free(__pa(addr), size); - } else { - xen_cleanmfnmap(addr); - } -} - -static void __init xen_pagetable_cleanhighmap(void) -{ - unsigned long size; - unsigned long addr; - - /* At this stage, cleanup_highmap has already cleaned __ka space - * from _brk_limit way up to the max_pfn_mapped (which is the end of - * the ramdisk). We continue on, erasing PMD entries that point to page - * tables - do note that they are accessible at this stage via __va. - * For good measure we also round up to the PMD - which means that if - * anybody is using __ka address to the initial boot-stack - and try - * to use it - they are going to crash. The xen_start_info has been - * taken care of already in xen_setup_kernel_pagetable. */ - addr = xen_start_info->pt_base; - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); - - xen_cleanhighmap(addr, addr + size); - xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); -#ifdef DEBUG - /* This is superfluous and is not necessary, but you know what - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of - * anything at this stage. 
*/ - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); -#endif -} -#endif - -static void __init xen_pagetable_p2m_setup(void) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - xen_vmalloc_p2m_tree(); - -#ifdef CONFIG_X86_64 - xen_pagetable_p2m_free(); - - xen_pagetable_cleanhighmap(); -#endif - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; -} - -static void __init xen_pagetable_init(void) -{ - paging_init(); - xen_post_allocator_init(); - - xen_pagetable_p2m_setup(); - - /* Allocate and initialize top and mid mfn levels for p2m structure */ - xen_build_mfn_list_list(); - - /* Remap memory freed due to conflicts with E820 map */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_remap_memory(); - - xen_setup_shared_info(); -} -static void xen_write_cr2(unsigned long cr2) -{ - this_cpu_read(xen_vcpu)->arch.cr2 = cr2; -} - -static unsigned long xen_read_cr2(void) -{ - return this_cpu_read(xen_vcpu)->arch.cr2; -} - -unsigned long xen_read_cr2_direct(void) -{ - return this_cpu_read(xen_vcpu_info.arch.cr2); -} - void xen_flush_tlb_all(void) { struct mmuext_op *op; @@ -1331,1437 +61,6 @@ void xen_flush_tlb_all(void) preempt_enable(); } -static void xen_flush_tlb(void) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - trace_xen_mmu_flush_tlb(0); - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_single(unsigned long addr) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - trace_xen_mmu_flush_tlb_single(addr); - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = MMUEXT_INVLPG_LOCAL; - op->arg1.linear_addr = addr & PAGE_MASK; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - struct { - struct mmuext_op op; -#ifdef CONFIG_SMP - DECLARE_BITMAP(mask, num_processors); -#else - DECLARE_BITMAP(mask, NR_CPUS); -#endif - } *args; - struct multicall_space mcs; - - trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); - - if (cpumask_empty(cpus)) - return; /* nothing to do */ - - mcs = xen_mc_entry(sizeof(*args)); - args = mcs.args; - args->op.arg2.vcpumask = to_cpumask(args->mask); - - /* Remove us, and any offline CPUS. */ - cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { - args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = start; - } - - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -static unsigned long xen_read_cr3(void) -{ - return this_cpu_read(xen_cr3); -} - -static void set_current_cr3(void *v) -{ - this_cpu_write(xen_current_cr3, (unsigned long)v); -} - -static void __xen_write_cr3(bool kernel, unsigned long cr3) -{ - struct mmuext_op op; - unsigned long mfn; - - trace_xen_mmu_write_cr3(kernel, cr3); - - if (cr3) - mfn = pfn_to_mfn(PFN_DOWN(cr3)); - else - mfn = 0; - - WARN_ON(mfn == 0 && kernel); - - op.cmd = kernel ? 
MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; - op.arg1.mfn = mfn; - - xen_extend_mmuext_op(&op); - - if (kernel) { - this_cpu_write(xen_cr3, cr3); - - /* Update xen_current_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); - } -} -static void xen_write_cr3(unsigned long cr3) -{ - BUG_ON(preemptible()); - - xen_mc_batch(); /* disables interrupts */ - - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - this_cpu_write(xen_cr3, cr3); - - __xen_write_cr3(true, cr3); - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); - if (user_pgd) - __xen_write_cr3(false, __pa(user_pgd)); - else - __xen_write_cr3(false, 0); - } -#endif - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} - -#ifdef CONFIG_X86_64 -/* - * At the start of the day - when Xen launches a guest, it has already - * built pagetables for the guest. We diligently look over them - * in xen_setup_kernel_pagetable and graft as appropriate them in the - * init_level4_pgt and its friends. Then when we are happy we load - * the new init_level4_pgt - and continue on. - * - * The generic code starts (start_kernel) and 'init_mem_mapping' sets - * up the rest of the pagetables. When it has completed it loads the cr3. - * N.B. that baremetal would start at 'start_kernel' (and the early - * #PF handler would create bootstrap pagetables) - so we are running - * with the same assumptions as what to do when write_cr3 is executed - * at this point. - * - * Since there are no user-page tables at all, we have two variants - * of xen_write_cr3 - the early bootup (this one), and the late one - * (xen_write_cr3). The reason we have to do that is that in 64-bit - * the Linux kernel and user-space are both in ring 3 while the - * hypervisor is in ring 0. - */ -static void __init xen_write_cr3_init(unsigned long cr3) -{ - BUG_ON(preemptible()); - - xen_mc_batch(); /* disables interrupts */ - - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - this_cpu_write(xen_cr3, cr3); - - __xen_write_cr3(true, cr3); - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} -#endif - -static int xen_pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = mm->pgd; - int ret = 0; - - BUG_ON(PagePinned(virt_to_page(pgd))); - -#ifdef CONFIG_X86_64 - { - struct page *page = virt_to_page(pgd); - pgd_t *user_pgd; - - BUG_ON(page->private != 0); - - ret = -ENOMEM; - - user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - page->private = (unsigned long)user_pgd; - - if (user_pgd != NULL) { -#ifdef CONFIG_X86_VSYSCALL_EMULATION - user_pgd[pgd_index(VSYSCALL_ADDR)] = - __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); -#endif - ret = 0; - } - - BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); - } -#endif - - return ret; -} - -static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - if (user_pgd) - free_page((unsigned long)user_pgd); -#endif -} - -/* - * Init-time set_pte while constructing initial pagetables, which - * doesn't allow RO page table pages to be remapped RW. - * - * If there is no MFN for this PFN then this page is initially - * ballooned out so clear the PTE (as in decrease_reservation() in - * drivers/xen/balloon.c). - * - * Many of these PTE updates are done on unpinned and writable pages - * and doing a hypercall for these is unnecessary and expensive. 
At - * this point it is not possible to tell if a page is pinned or not, - * so always write the PTE directly and rely on Xen trapping and - * emulating any updates as necessary. - */ -__visible pte_t xen_make_pte_init(pteval_t pte) -{ -#ifdef CONFIG_X86_64 - unsigned long pfn; - - /* - * Pages belonging to the initial p2m list mapped outside the default - * address range must be mapped read-only. This region contains the - * page tables for mapping the p2m list, too, and page tables MUST be - * mapped read-only. - */ - pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; - if (xen_start_info->mfn_list < __START_KERNEL_map && - pfn >= xen_start_info->first_p2m_pfn && - pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) - pte &= ~_PAGE_RW; -#endif - pte = pte_pfn_to_mfn(pte); - return native_make_pte(pte); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); - -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) -{ -#ifdef CONFIG_X86_32 - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_mfn(pte) != INVALID_P2M_ENTRY - && pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); -#endif - native_set_pte(ptep, pte); -} - -/* Early in boot, while setting up the initial pagetable, assume - everything is pinned. */ -static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) -{ -#ifdef CONFIG_FLATMEM - BUG_ON(mem_map); /* should only be used early */ -#endif - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); -} - -/* Used for pmd and pud */ -static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) -{ -#ifdef CONFIG_FLATMEM - BUG_ON(mem_map); /* should only be used early */ -#endif - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} - -/* Early release_pte assumes that all pts are pinned, since there's - only init_mm and anything attached to that is pinned. */ -static void __init xen_release_pte_init(unsigned long pfn) -{ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); -} - -static void __init xen_release_pmd_init(unsigned long pfn) -{ - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); -} - -static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct multicall_space mcs; - struct mmuext_op *op; - - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = cmd; - op->arg1.mfn = pfn_to_mfn(pfn); - - MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); -} - -static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) -{ - struct multicall_space mcs; - unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); - - mcs = __xen_mc_entry(0); - MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, - pfn_pte(pfn, prot), 0); -} - -/* This needs to make sure the new pte page is pinned iff its being - attached to a pinned pagetable. 
*/ -static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, - unsigned level) -{ - bool pinned = PagePinned(virt_to_page(mm->pgd)); - - trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); - - if (pinned) { - struct page *page = pfn_to_page(pfn); - - SetPagePinned(page); - - if (!PageHighMem(page)) { - xen_mc_batch(); - - __set_pfn_prot(pfn, PAGE_KERNEL_RO); - - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) - __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - } else { - /* make sure there are no stray mappings of - this page */ - kmap_flush_unused(); - } - } -} - -static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PTE); -} - -static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PMD); -} - -/* This should never happen until we're OK to use struct page */ -static inline void xen_release_ptpage(unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - bool pinned = PagePinned(page); - - trace_xen_mmu_release_ptpage(pfn, level, pinned); - - if (pinned) { - if (!PageHighMem(page)) { - xen_mc_batch(); - - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) - __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - - __set_pfn_prot(pfn, PAGE_KERNEL); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - } - ClearPagePinned(page); - } -} - -static void xen_release_pte(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PTE); -} - -static void xen_release_pmd(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PMD); -} - -#if CONFIG_PGTABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PUD); -} - -static void xen_release_pud(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PUD); -} -#endif - -void __init xen_reserve_top(void) -{ -#ifdef CONFIG_X86_32 - unsigned long top = HYPERVISOR_VIRT_START; - struct xen_platform_parameters pp; - - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) - top = pp.virt_start; - - reserve_top_address(-top); -#endif /* CONFIG_X86_32 */ -} - -/* - * Like __va(), but returns address in the kernel mapping (which is - * all we have until the physical memory mapping has been set up. 
- */ -static void * __init __ka(phys_addr_t paddr) -{ -#ifdef CONFIG_X86_64 - return (void *)(paddr + __START_KERNEL_map); -#else - return __va(paddr); -#endif -} - -/* Convert a machine address to physical address */ -static unsigned long __init m2p(phys_addr_t maddr) -{ - phys_addr_t paddr; - - maddr &= PTE_PFN_MASK; - paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; - - return paddr; -} - -/* Convert a machine address to kernel virtual */ -static void * __init m2v(phys_addr_t maddr) -{ - return __ka(m2p(maddr)); -} - -/* Set the page permissions on an identity-mapped pages */ -static void __init set_page_prot_flags(void *addr, pgprot_t prot, - unsigned long flags) -{ - unsigned long pfn = __pa(addr) >> PAGE_SHIFT; - pte_t pte = pfn_pte(pfn, prot); - - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) - BUG(); -} -static void __init set_page_prot(void *addr, pgprot_t prot) -{ - return set_page_prot_flags(addr, prot, UVMF_NONE); -} -#ifdef CONFIG_X86_32 -static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) -{ - unsigned pmdidx, pteidx; - unsigned ident_pte; - unsigned long pfn; - - level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, - PAGE_SIZE); - - ident_pte = 0; - pfn = 0; - for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { - pte_t *pte_page; - - /* Reuse or allocate a page of ptes */ - if (pmd_present(pmd[pmdidx])) - pte_page = m2v(pmd[pmdidx].pmd); - else { - /* Check for free pte pages */ - if (ident_pte == LEVEL1_IDENT_ENTRIES) - break; - - pte_page = &level1_ident_pgt[ident_pte]; - ident_pte += PTRS_PER_PTE; - - pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); - } - - /* Install mappings */ - for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { - pte_t pte; - - if (pfn > max_pfn_mapped) - max_pfn_mapped = pfn; - - if (!pte_none(pte_page[pteidx])) - continue; - - pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); - pte_page[pteidx] = pte; - } - } - - for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) - set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); - - set_page_prot(pmd, PAGE_KERNEL_RO); -} -#endif -void __init xen_setup_machphys_mapping(void) -{ - struct xen_machphys_mapping mapping; - - if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { - machine_to_phys_mapping = (unsigned long *)mapping.v_start; - machine_to_phys_nr = mapping.max_mfn + 1; - } else { - machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; - } -#ifdef CONFIG_X86_32 - WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) - < machine_to_phys_mapping); -#endif -} - -#ifdef CONFIG_X86_64 -static void __init convert_pfn_mfn(void *v) -{ - pte_t *pte = v; - int i; - - /* All levels are converted the same way, so just treat them - as ptes. */ - for (i = 0; i < PTRS_PER_PTE; i++) - pte[i] = xen_make_pte(pte[i].pte); -} -static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, - unsigned long addr) -{ - if (*pt_base == PFN_DOWN(__pa(addr))) { - set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); - clear_page((void *)addr); - (*pt_base)++; - } - if (*pt_end == PFN_DOWN(__pa(addr))) { - set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); - clear_page((void *)addr); - (*pt_end)--; - } -} -/* - * Set up the initial kernel pagetable. - * - * We can construct this by grafting the Xen provided pagetable into - * head_64.S's preconstructed pagetables. We copy the Xen L2's into - * level2_ident_pgt, and level2_kernel_pgt. 
This means that only the - * kernel has a physical mapping to start with - but that's enough to - * get __va working. We need to fill in the rest of the physical - * mapping once some sort of allocator has been set up. - */ -void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) -{ - pud_t *l3; - pmd_t *l2; - unsigned long addr[3]; - unsigned long pt_base, pt_end; - unsigned i; - - /* max_pfn_mapped is the last pfn mapped in the initial memory - * mappings. Considering that on Xen after the kernel mappings we - * have the mappings of some pages that don't exist in pfn space, we - * set max_pfn_mapped to the last real pfn mapped. */ - if (xen_start_info->mfn_list < __START_KERNEL_map) - max_pfn_mapped = xen_start_info->first_p2m_pfn; - else - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); - - pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); - pt_end = pt_base + xen_start_info->nr_pt_frames; - - /* Zap identity mapping */ - init_level4_pgt[0] = __pgd(0); - - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* Pre-constructed entries are in pfn, so convert to mfn */ - /* L4[272] -> level3_ident_pgt - * L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); - - /* L3_i[0] -> level2_ident_pgt */ - convert_pfn_mfn(level3_ident_pgt); - /* L3_k[510] -> level2_kernel_pgt - * L3_k[511] -> level2_fixmap_pgt */ - convert_pfn_mfn(level3_kernel_pgt); - - /* L3_k[511][506] -> level1_fixmap_pgt */ - convert_pfn_mfn(level2_fixmap_pgt); - } - /* We get [511][511] and have Xen's version of level2_kernel_pgt */ - l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); - - addr[0] = (unsigned long)pgd; - addr[1] = (unsigned long)l3; - addr[2] = (unsigned long)l2; - /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: - * Both L4[272][0] and L4[511][510] have entries that point to the same - * L2 (PMD) tables. Meaning that if you modify it in __va space - * it will be also modified in the __ka space! (But if you just - * modify the PMD table to point to other PTE's or none, then you - * are OK - which is what cleanup_highmap does) */ - copy_page(level2_ident_pgt, l2); - /* Graft it onto L4[511][510] */ - copy_page(level2_kernel_pgt, l2); - - /* Copy the initial P->M table mappings if necessary. */ - i = pgd_index(xen_start_info->mfn_list); - if (i && i < pgd_index(__START_KERNEL_map)) - init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; - - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. 
- */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); - } else - native_write_cr3(__pa(init_level4_pgt)); - - /* We can't that easily rip out L3 and L2, as the Xen pagetables are - * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for - * the initial domain. For guests using the toolstack, they are in: - * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only - * rip out the [L4] (pgd), but for guests we shave off three pages. - */ - for (i = 0; i < ARRAY_SIZE(addr); i++) - check_pt_base(&pt_base, &pt_end, addr[i]); - - /* Our (by three pages) smaller Xen pagetable that we are using */ - xen_pt_base = PFN_PHYS(pt_base); - xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; - memblock_reserve(xen_pt_base, xen_pt_size); - - /* Revector the xen_start_info */ - xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); -} - -/* - * Read a value from a physical address. - */ -static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) -{ - unsigned long *vaddr; - unsigned long val; - - vaddr = early_memremap_ro(addr, sizeof(val)); - val = *vaddr; - early_memunmap(vaddr, sizeof(val)); - return val; -} - -/* - * Translate a virtual address to a physical one without relying on mapped - * page tables. - */ -static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) -{ - phys_addr_t pa; - pgd_t pgd; - pud_t pud; - pmd_t pmd; - pte_t pte; - - pa = read_cr3(); - pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * - sizeof(pgd))); - if (!pgd_present(pgd)) - return 0; - - pa = pgd_val(pgd) & PTE_PFN_MASK; - pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * - sizeof(pud))); - if (!pud_present(pud)) - return 0; - pa = pud_pfn(pud) << PAGE_SHIFT; - if (pud_large(pud)) - return pa + (vaddr & ~PUD_MASK); - - pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * - sizeof(pmd))); - if (!pmd_present(pmd)) - return 0; - pa = pmd_pfn(pmd) << PAGE_SHIFT; - if (pmd_large(pmd)) - return pa + (vaddr & ~PMD_MASK); - - pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * - sizeof(pte))); - if (!pte_present(pte)) - return 0; - pa = pte_pfn(pte) << PAGE_SHIFT; - - return pa | (vaddr & ~PAGE_MASK); -} - -/* - * Find a new area for the hypervisor supplied p2m list and relocate the p2m to - * this area. - */ -void __init xen_relocate_p2m(void) -{ - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; - unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; - int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; - pte_t *pt; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; - unsigned long *new_p2m; - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; - n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; - n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; - n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; - n_frames = n_pte + n_pt + n_pmd + n_pud; - - new_area = xen_find_free_area(PFN_PHYS(n_frames)); - if (!new_area) { - xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); - BUG(); - } - - /* - * Setup the page tables for addressing the new p2m list. - * We have asked the hypervisor to map the p2m list at the user address - * PUD_SIZE. It may have done so, or it may have used a kernel space - * address depending on the Xen version. - * To avoid any possible virtual address collision, just use - * 2 * PUD_SIZE for the new area. 
- */ - pud_phys = new_area; - pmd_phys = pud_phys + PFN_PHYS(n_pud); - pt_phys = pmd_phys + PFN_PHYS(n_pmd); - p2m_pfn = PFN_DOWN(pt_phys) + n_pt; - - pgd = __va(read_cr3()); - new_p2m = (unsigned long *)(2 * PGDIR_SIZE); - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { - pud = early_memremap(pud_phys, PAGE_SIZE); - clear_page(pud); - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); - idx_pmd++) { - pmd = early_memremap(pmd_phys, PAGE_SIZE); - clear_page(pmd); - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); - idx_pt++) { - pt = early_memremap(pt_phys, PAGE_SIZE); - clear_page(pt); - for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); - p2m_pfn++; - } - n_pte -= PTRS_PER_PTE; - early_memunmap(pt, PAGE_SIZE); - make_lowmem_page_readonly(__va(pt_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, - PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); - pt_phys += PAGE_SIZE; - } - n_pt -= PTRS_PER_PMD; - early_memunmap(pmd, PAGE_SIZE); - make_lowmem_page_readonly(__va(pmd_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, - PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); - pmd_phys += PAGE_SIZE; - } - n_pmd -= PTRS_PER_PUD; - early_memunmap(pud, PAGE_SIZE); - make_lowmem_page_readonly(__va(pud_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); - pud_phys += PAGE_SIZE; - } - - /* Now copy the old p2m info to the new area. */ - memcpy(new_p2m, xen_p2m_addr, size); - xen_p2m_addr = new_p2m; - - /* Release the old p2m list and set new list info. */ - p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); - BUG_ON(!p2m_pfn); - p2m_pfn_end = p2m_pfn + PFN_DOWN(size); - - if (xen_start_info->mfn_list < __START_KERNEL_map) { - pfn = xen_start_info->first_p2m_pfn; - pfn_end = xen_start_info->first_p2m_pfn + - xen_start_info->nr_p2m_frames; - set_pgd(pgd + 1, __pgd(0)); - } else { - pfn = p2m_pfn; - pfn_end = p2m_pfn_end; - } - - memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); - while (pfn < pfn_end) { - if (pfn == p2m_pfn) { - pfn = p2m_pfn_end; - continue; - } - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); - pfn++; - } - - xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; - xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); - xen_start_info->nr_p2m_frames = n_frames; -} - -#else /* !CONFIG_X86_64 */ -static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); -static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); - -static void __init xen_write_cr3_init(unsigned long cr3) -{ - unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); - - BUG_ON(read_cr3() != __pa(initial_page_table)); - BUG_ON(cr3 != __pa(swapper_pg_dir)); - - /* - * We are switching to swapper_pg_dir for the first time (from - * initial_page_table) and therefore need to mark that page - * read-only and then pin it. - * - * Xen disallows sharing of kernel PMDs for PAE - * guests. Therefore we must copy the kernel PMD from - * initial_page_table into a new kernel PMD to be used in - * swapper_pg_dir. 
- */ - swapper_kernel_pmd = - extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - copy_page(swapper_kernel_pmd, initial_kernel_pmd); - swapper_pg_dir[KERNEL_PGD_BOUNDARY] = - __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); - set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); - - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); - xen_write_cr3(cr3); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); - - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(initial_page_table))); - set_page_prot(initial_page_table, PAGE_KERNEL); - set_page_prot(initial_kernel_pmd, PAGE_KERNEL); - - pv_mmu_ops.write_cr3 = &xen_write_cr3; -} - -/* - * For 32 bit domains xen_start_info->pt_base is the pgd address which might be - * not the first page table in the page table pool. - * Iterate through the initial page tables to find the real page table base. - */ -static phys_addr_t xen_find_pt_base(pmd_t *pmd) -{ - phys_addr_t pt_base, paddr; - unsigned pmdidx; - - pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); - - for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) - if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { - paddr = m2p(pmd[pmdidx].pmd); - pt_base = min(pt_base, paddr); - } - - return pt_base; -} - -void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) -{ - pmd_t *kernel_pmd; - - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - - xen_pt_base = xen_find_pt_base(kernel_pmd); - xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; - - initial_kernel_pmd = - extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - - max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - - copy_page(initial_kernel_pmd, kernel_pmd); - - xen_map_identity_early(initial_kernel_pmd, max_pfn); - - copy_page(initial_page_table, pgd); - initial_page_table[KERNEL_PGD_BOUNDARY] = - __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); - - set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); - set_page_prot(initial_page_table, PAGE_KERNEL_RO); - set_page_prot(empty_zero_page, PAGE_KERNEL_RO); - - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, - PFN_DOWN(__pa(initial_page_table))); - xen_write_cr3(__pa(initial_page_table)); - - memblock_reserve(xen_pt_base, xen_pt_size); -} -#endif /* CONFIG_X86_64 */ - -void __init xen_reserve_special_pages(void) -{ - phys_addr_t paddr; - - memblock_reserve(__pa(xen_start_info), PAGE_SIZE); - if (xen_start_info->store_mfn) { - paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); - memblock_reserve(paddr, PAGE_SIZE); - } - if (!xen_initial_domain()) { - paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); - memblock_reserve(paddr, PAGE_SIZE); - } -} - -void __init xen_pt_check_e820(void) -{ - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } -} - -static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; - -static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) -{ - pte_t pte; - - phys >>= PAGE_SHIFT; - - switch (idx) { - case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: - case FIX_RO_IDT: -#ifdef CONFIG_X86_32 - case FIX_WP_TEST: -# ifdef CONFIG_HIGHMEM - case FIX_KMAP_BEGIN ... 
FIX_KMAP_END: -# endif -#elif defined(CONFIG_X86_VSYSCALL_EMULATION) - case VSYSCALL_PAGE: -#endif - case FIX_TEXT_POKE0: - case FIX_TEXT_POKE1: - /* All local page mappings */ - pte = pfn_pte(phys, prot); - break; - -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ - pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); - break; -#endif - -#ifdef CONFIG_X86_IO_APIC - case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: - /* - * We just don't map the IO APIC - all access is via - * hypercalls. Keep the address in the pte for reference. - */ - pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); - break; -#endif - - case FIX_PARAVIRT_BOOTMAP: - /* This is an MFN, but it isn't an IO mapping from the - IO domain */ - pte = mfn_pte(phys, prot); - break; - - default: - /* By default, set_fixmap is used for hardware mappings */ - pte = mfn_pte(phys, prot); - break; - } - - __native_set_fixmap(idx, pte); - -#ifdef CONFIG_X86_VSYSCALL_EMULATION - /* Replicate changes to map the vsyscall page into the user - pagetable vsyscall mapping. */ - if (idx == VSYSCALL_PAGE) { - unsigned long vaddr = __fix_to_virt(idx); - set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); - } -#endif -} - -static void __init xen_post_allocator_init(void) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - pv_mmu_ops.set_pte = xen_set_pte; - pv_mmu_ops.set_pmd = xen_set_pmd; - pv_mmu_ops.set_pud = xen_set_pud; -#if CONFIG_PGTABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = xen_set_pgd; -#endif - - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; -#if CONFIG_PGTABLE_LEVELS == 4 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; -#endif - pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); - -#ifdef CONFIG_X86_64 - pv_mmu_ops.write_cr3 = &xen_write_cr3; - SetPagePinned(virt_to_page(level3_user_vsyscall)); -#endif - xen_mark_init_mm_pinned(); -} - -static void xen_leave_lazy_mmu(void) -{ - preempt_disable(); - xen_mc_flush(); - paravirt_leave_lazy_mmu(); - preempt_enable(); -} - -static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3_init, - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - - .pte_update = paravirt_nop, - - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, - - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pmd_init, - .release_pmd = xen_release_pmd_init, - - .set_pte = xen_set_pte_init, - .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd_hyper, - - .ptep_modify_prot_start = __ptep_modify_prot_start, - .ptep_modify_prot_commit = __ptep_modify_prot_commit, - - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - -#ifdef CONFIG_X86_PAE - .set_pte_atomic = xen_set_pte_atomic, - .pte_clear = xen_pte_clear, - .pmd_clear = xen_pmd_clear, -#endif /* CONFIG_X86_PAE */ - .set_pud = xen_set_pud_hyper, - - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - -#if CONFIG_PGTABLE_LEVELS == 4 - .pud_val = 
PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_pgd = xen_set_pgd_hyper, - - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, - - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, - }, - - .set_fixmap = xen_set_fixmap, -}; - -void __init xen_init_mmu_ops(void) -{ - x86_init.paging.pagetable_init = xen_pagetable_init; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - pv_mmu_ops = xen_mmu_ops; - - memset(dummy_mapping, 0xff, PAGE_SIZE); -} - -/* Protected by xen_reservation_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; - -#define VOID_PTE (mfn_pte(0, __pgprot(0))) -static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, - unsigned long *in_frames, - unsigned long *out_frames) -{ - int i; - struct multicall_space mcs; - - xen_mc_batch(); - for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { - mcs = __xen_mc_entry(0); - - if (in_frames) - in_frames[i] = virt_to_mfn(vaddr); - - MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); - - if (out_frames) - out_frames[i] = virt_to_pfn(vaddr); - } - xen_mc_issue(0); -} - -/* - * Update the pfn-to-mfn mappings for a virtual address range, either to - * point to an array of mfns, or contiguously from a single starting - * mfn. - */ -static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, - unsigned long *mfns, - unsigned long first_mfn) -{ - unsigned i, limit; - unsigned long mfn; - - xen_mc_batch(); - - limit = 1u << order; - for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { - struct multicall_space mcs; - unsigned flags; - - mcs = __xen_mc_entry(0); - if (mfns) - mfn = mfns[i]; - else - mfn = first_mfn + i; - - if (i < (limit - 1)) - flags = 0; - else { - if (order == 0) - flags = UVMF_INVLPG | UVMF_ALL; - else - flags = UVMF_TLB_FLUSH | UVMF_ALL; - } - - MULTI_update_va_mapping(mcs.mc, vaddr, - mfn_pte(mfn, PAGE_KERNEL), flags); - - set_phys_to_machine(virt_to_pfn(vaddr), mfn); - } - - xen_mc_issue(0); -} - -/* - * Perform the hypercall to exchange a region of our pfns to point to - * memory with the required contiguous alignment. Takes the pfns as - * input, and populates mfns as output. - * - * Returns a success code indicating whether the hypervisor was able to - * satisfy the request or not. 
- */ -static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, - unsigned long *pfns_in, - unsigned long extents_out, - unsigned int order_out, - unsigned long *mfns_out, - unsigned int address_bits) -{ - long rc; - int success; - - struct xen_memory_exchange exchange = { - .in = { - .nr_extents = extents_in, - .extent_order = order_in, - .extent_start = pfns_in, - .domid = DOMID_SELF - }, - .out = { - .nr_extents = extents_out, - .extent_order = order_out, - .extent_start = mfns_out, - .address_bits = address_bits, - .domid = DOMID_SELF - } - }; - - BUG_ON(extents_in << order_in != extents_out << order_out); - - rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); - success = (exchange.nr_exchanged == extents_in); - - BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); - BUG_ON(success && (rc != 0)); - - return success; -} - -int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, - unsigned int address_bits, - dma_addr_t *dma_handle) -{ - unsigned long *in_frames = discontig_frames, out_frame; - unsigned long flags; - int success; - unsigned long vstart = (unsigned long)phys_to_virt(pstart); - - /* - * Currently an auto-translated guest will not perform I/O, nor will - * it require PAE page directories below 4GB. Therefore any calls to - * this function are redundant and can be ignored. - */ - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - if (unlikely(order > MAX_CONTIG_ORDER)) - return -ENOMEM; - - memset((void *) vstart, 0, PAGE_SIZE << order); - - spin_lock_irqsave(&xen_reservation_lock, flags); - - /* 1. Zap current PTEs, remembering MFNs. */ - xen_zap_pfn_range(vstart, order, in_frames, NULL); - - /* 2. Get a new contiguous memory extent. */ - out_frame = virt_to_pfn(vstart); - success = xen_exchange_memory(1UL << order, 0, in_frames, - 1, order, &out_frame, - address_bits); - - /* 3. Map the new extent in place of old pages. */ - if (success) - xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); - else - xen_remap_exchanged_ptes(vstart, order, in_frames, 0); - - spin_unlock_irqrestore(&xen_reservation_lock, flags); - - *dma_handle = virt_to_machine(vstart).maddr; - return success ? 0 : -ENOMEM; -} -EXPORT_SYMBOL_GPL(xen_create_contiguous_region); - -void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) -{ - unsigned long *out_frames = discontig_frames, in_frame; - unsigned long flags; - int success; - unsigned long vstart; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - if (unlikely(order > MAX_CONTIG_ORDER)) - return; - - vstart = (unsigned long)phys_to_virt(pstart); - memset((void *) vstart, 0, PAGE_SIZE << order); - - spin_lock_irqsave(&xen_reservation_lock, flags); - - /* 1. Find start MFN of contiguous extent. */ - in_frame = virt_to_mfn(vstart); - - /* 2. Zap current PTEs. */ - xen_zap_pfn_range(vstart, order, NULL, out_frames); - - /* 3. Do the exchange for non-contiguous MFNs. */ - success = xen_exchange_memory(1, order, &in_frame, 1UL << order, - 0, out_frames, 0); - - /* 4. Map new pages in place of old pages. 
*/ - if (success) - xen_remap_exchanged_ptes(vstart, order, out_frames, 0); - else - xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); - - spin_unlock_irqrestore(&xen_reservation_lock, flags); -} -EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); - -#ifdef CONFIG_XEN_PVHVM -#ifdef CONFIG_PROC_VMCORE -/* - * This function is used in two contexts: - * - the kdump kernel has to check whether a pfn of the crashed kernel - * was a ballooned page. vmcore is using this function to decide - * whether to access a pfn of the crashed kernel. - * - the kexec kernel has to check whether a pfn was ballooned by the - * previous kernel. If the pfn is ballooned, handle it properly. - * Returns 0 if the pfn is not backed by a RAM page, the caller may - * handle the pfn special in this case. - */ -static int xen_oldmem_pfn_is_ram(unsigned long pfn) -{ - struct xen_hvm_get_mem_type a = { - .domid = DOMID_SELF, - .pfn = pfn, - }; - int ram; - - if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) - return -ENXIO; - - switch (a.mem_type) { - case HVMMEM_mmio_dm: - ram = 0; - break; - case HVMMEM_ram_rw: - case HVMMEM_ram_ro: - default: - ram = 1; - break; - } - - return ram; -} -#endif - -static void xen_hvm_exit_mmap(struct mm_struct *mm) -{ - struct xen_hvm_pagetable_dying a; - int rc; - - a.domid = DOMID_SELF; - a.gpa = __pa(mm->pgd); - rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); - WARN_ON_ONCE(rc < 0); -} - -static int is_pagetable_dying_supported(void) -{ - struct xen_hvm_pagetable_dying a; - int rc = 0; - - a.domid = DOMID_SELF; - a.gpa = 0x00; - rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); - if (rc < 0) { - printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); - return 0; - } - return 1; -} - -void __init xen_hvm_init_mmu_ops(void) -{ - if (is_pagetable_dying_supported()) - pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; -#ifdef CONFIG_PROC_VMCORE - register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); -#endif -} -#endif #define REMAP_BATCH_SIZE 16 @@ -2892,7 +191,6 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); - /* Returns: 0 success */ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages) diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 73809bb951b4..3fe2b3292915 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -5,6 +5,7 @@ enum pt_level { PT_PGD, + PT_P4D, PT_PUD, PT_PMD, PT_PTE diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c new file mode 100644 index 000000000000..1c57f1cd545c --- /dev/null +++ b/arch/x86/xen/mmu_hvm.c @@ -0,0 +1,79 @@ +#include <linux/types.h> +#include <linux/crash_dump.h> + +#include <xen/interface/xen.h> +#include <xen/hvm.h> + +#include "mmu.h" + +#ifdef CONFIG_PROC_VMCORE +/* + * This function is used in two contexts: + * - the kdump kernel has to check whether a pfn of the crashed kernel + * was a ballooned page. vmcore is using this function to decide + * whether to access a pfn of the crashed kernel. + * - the kexec kernel has to check whether a pfn was ballooned by the + * previous kernel. If the pfn is ballooned, handle it properly. + * Returns 0 if the pfn is not backed by a RAM page, the caller may + * handle the pfn special in this case. 
+ */ +static int xen_oldmem_pfn_is_ram(unsigned long pfn) +{ + struct xen_hvm_get_mem_type a = { + .domid = DOMID_SELF, + .pfn = pfn, + }; + int ram; + + if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) + return -ENXIO; + + switch (a.mem_type) { + case HVMMEM_mmio_dm: + ram = 0; + break; + case HVMMEM_ram_rw: + case HVMMEM_ram_ro: + default: + ram = 1; + break; + } + + return ram; +} +#endif + +static void xen_hvm_exit_mmap(struct mm_struct *mm) +{ + struct xen_hvm_pagetable_dying a; + int rc; + + a.domid = DOMID_SELF; + a.gpa = __pa(mm->pgd); + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + WARN_ON_ONCE(rc < 0); +} + +static int is_pagetable_dying_supported(void) +{ + struct xen_hvm_pagetable_dying a; + int rc = 0; + + a.domid = DOMID_SELF; + a.gpa = 0x00; + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + if (rc < 0) { + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); + return 0; + } + return 1; +} + +void __init xen_hvm_init_mmu_ops(void) +{ + if (is_pagetable_dying_supported()) + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +#ifdef CONFIG_PROC_VMCORE + register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); +#endif +} diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c new file mode 100644 index 000000000000..9d9ae6650aa1 --- /dev/null +++ b/arch/x86/xen/mmu_pv.c @@ -0,0 +1,2730 @@ +/* + * Xen mmu operations + * + * This file contains the various mmu fetch and update operations. + * The most important job they must perform is the mapping between the + * domain's pfn and the overall machine mfns. + * + * Xen allows guests to directly update the pagetable, in a controlled + * fashion. In other words, the guest modifies the same pagetable + * that the CPU actually uses, which eliminates the overhead of having + * a separate shadow pagetable. + * + * In order to allow this, it falls on the guest domain to map its + * notion of a "physical" pfn - which is just a domain-local linear + * address - into a real "machine address" which the CPU's MMU can + * use. + * + * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be + * inserted directly into the pagetable. When creating a new + * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, + * when reading the content back with __(pgd|pmd|pte)_val, it converts + * the mfn back into a pfn. + * + * The other constraint is that all pages which make up a pagetable + * must be mapped read-only in the guest. This prevents uncontrolled + * guest updates to the pagetable. Xen strictly enforces this, and + * will disallow any pagetable update which will end up mapping a + * pagetable page RW, and will disallow using any writable page as a + * pagetable. + * + * Naively, when loading %cr3 with the base of a new pagetable, Xen + * would need to validate the whole pagetable before going on. + * Naturally, this is quite slow. The solution is to "pin" a + * pagetable, which enforces all the constraints on the pagetable even + * when it is not actively in use. This menas that Xen can be assured + * that it is still valid when you do load it into %cr3, and doesn't + * need to revalidate it. 
+ * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/sched/mm.h> +#include <linux/highmem.h> +#include <linux/debugfs.h> +#include <linux/bug.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> +#ifdef CONFIG_KEXEC_CORE +#include <linux/kexec.h> +#endif + +#include <trace/events/xen.h> + +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/mmu_context.h> +#include <asm/setup.h> +#include <asm/paravirt.h> +#include <asm/e820/api.h> +#include <asm/linkage.h> +#include <asm/page.h> +#include <asm/init.h> +#include <asm/pat.h> +#include <asm/smp.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/version.h> +#include <xen/interface/memory.h> +#include <xen/hvc-console.h> + +#include "multicalls.h" +#include "mmu.h" +#include "debugfs.h" + +#ifdef CONFIG_X86_32 +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); +#endif +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ + +static phys_addr_t xen_pt_base, xen_pt_size __initdata; + +/* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. 
+ */ +#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + unsigned int level; + + pte = lookup_address(address, &level); + if (pte == NULL) + return; /* vaddr missing */ + + ptev = pte_wrprotect(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + unsigned int level; + + pte = lookup_address(address, &level); + if (pte == NULL) + return; /* vaddr missing */ + + ptev = pte_mkwrite(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +static bool xen_page_pinned(void *ptr) +{ + struct page *page = virt_to_page(ptr); + + return PagePinned(page); +} + +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) +{ + struct multicall_space mcs; + struct mmu_update *u; + + trace_xen_mmu_set_domain_pte(ptep, pteval, domid); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + + /* ptep might be kmapped when using 32-bit HIGHPTE */ + u->ptr = virt_to_machine(ptep).maddr; + u->val = pte_val_ma(pteval); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} +EXPORT_SYMBOL_GPL(xen_set_domain_pte); + +static void xen_extend_mmu_update(const struct mmu_update *update) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); + + if (mcs.mc != NULL) { + mcs.mc->args[1]++; + } else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } + + u = mcs.args; + *u = *update; +} + +static void xen_extend_mmuext_op(const struct mmuext_op *op) +{ + struct multicall_space mcs; + struct mmuext_op *u; + + mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); + + if (mcs.mc != NULL) { + mcs.mc->args[1]++; + } else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } + + u = mcs.args; + *u = *op; +} + +static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + trace_xen_mmu_set_pmd(ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pmd_hyper(ptr, val); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. 
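The batching trick in xen_extend_mmu_update() and xen_extend_mmuext_op() above (grow the argument count of the most recently queued hypercall when it is of the same kind, otherwise queue a fresh one) can be modelled standalone. The queue, the op numbers and the size limit below are simplified stand-ins, not the real multicall machinery:

#include <stdio.h>

#define MAX_ARGS 8

struct mc {
    int op;                          /* which "hypercall" */
    int nargs;                       /* how many updates were folded into it */
    unsigned long args[MAX_ARGS];
};

static struct mc queue[4];
static int nqueued;

static void extend_mmu_update(unsigned long val)
{
    struct mc *mc = nqueued ? &queue[nqueued - 1] : NULL;

    if (mc && mc->op == 1 /* "mmu_update" */ && mc->nargs < MAX_ARGS) {
        mc->args[mc->nargs++] = val;         /* piggy-back on the last queued call */
    } else {
        mc = &queue[nqueued++];
        mc->op = 1;
        mc->nargs = 1;
        mc->args[0] = val;
    }
}

int main(void)
{
    for (unsigned long v = 0; v < 5; v++)
        extend_mmu_update(v);

    printf("%d hypercall(s) queued, the first carries %d updates\n",
           nqueued, queue[0].nargs);
    return 0;
}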
+ */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); +} + +static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) +{ + struct mmu_update u; + + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + return false; + + xen_mc_batch(); + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + return true; +} + +static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) +{ + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } +} + +static void xen_set_pte(pte_t *ptep, pte_t pteval) +{ + trace_xen_mmu_set_pte(ptep, pteval); + __xen_set_pte(ptep, pteval); +} + +static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); + __xen_set_pte(ptep, pteval); +} + +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + /* Just return the pte as-is. We preserve the bits on commit */ + trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); + return *ptep; +} + +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct mmu_update u; + + trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); + xen_mc_batch(); + + u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u.val = pte_val_ma(pte); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long pfn = mfn_to_pfn(mfn); + + pteval_t flags = val & PTE_FLAGS_MASK; + if (unlikely(pfn == ~0)) + val = flags & ~_PAGE_PRESENT; + else + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; +} + +static pteval_t pte_pfn_to_mfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + unsigned long mfn; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + mfn = __pfn_to_mfn(pfn); + else + mfn = pfn; + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. 
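The asymmetry described in the comment above can be shown with a toy version of pte_pfn_to_mfn(): when the p2m lookup fails, the result collapses to an empty, non-present pte and the original pfn is gone. The masks, the four-entry p2m table and the invalid marker below are all made up for the demo:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT      12
#define PTE_FLAGS       0xfffULL             /* low flag bits only, for the demo */
#define PTE_FRAME       (~PTE_FLAGS)
#define PRESENT         0x1ULL
#define INVALID_ENTRY   (~0ULL)

static uint64_t p2m[4] = { 7, 9, INVALID_ENTRY, 3 };   /* pfn -> mfn */

static uint64_t pte_pfn_to_mfn(uint64_t val)
{
    if (val & PRESENT) {
        uint64_t pfn = (val & PTE_FRAME) >> PAGE_SHIFT;
        uint64_t flags = val & PTE_FLAGS;
        uint64_t mfn = (pfn < 4) ? p2m[pfn] : INVALID_ENTRY;

        if (mfn == INVALID_ENTRY)
            return 0;                        /* no backing frame: empty, non-present pte */
        val = (mfn << PAGE_SHIFT) | flags;
    }
    return val;
}

int main(void)
{
    uint64_t pte_ok  = (1ULL << PAGE_SHIFT) | PRESENT;   /* pfn 1 has an mfn */
    uint64_t pte_bad = (2ULL << PAGE_SHIFT) | PRESENT;   /* pfn 2 is ballooned out */

    printf("pfn 1 -> pte %#llx\n", (unsigned long long)pte_pfn_to_mfn(pte_ok));
    printf("pfn 2 -> pte %#llx (original pfn is lost)\n",
           (unsigned long long)pte_pfn_to_mfn(pte_bad));
    return 0;
}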
+ */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } else + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; + } + + return val; +} + +__visible pteval_t xen_pte_val(pte_t pte) +{ + pteval_t pteval = pte.pte; + + return pte_mfn_to_pfn(pteval); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); + +__visible pgdval_t xen_pgd_val(pgd_t pgd) +{ + return pte_mfn_to_pfn(pgd.pgd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); + +__visible pte_t xen_make_pte(pteval_t pte) +{ + pte = pte_pfn_to_mfn(pte); + + return native_make_pte(pte); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); + +__visible pgd_t xen_make_pgd(pgdval_t pgd) +{ + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); + +__visible pmdval_t xen_pmd_val(pmd_t pmd) +{ + return pte_mfn_to_pfn(pmd.pmd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); + +static void xen_set_pud_hyper(pud_t *ptr, pud_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_set_pud(pud_t *ptr, pud_t val) +{ + trace_xen_mmu_set_pud(ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pud_hyper(ptr, val); +} + +#ifdef CONFIG_X86_PAE +static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + trace_xen_mmu_set_pte_atomic(ptep, pte); + set_64bit((u64 *)ptep, native_pte_val(pte)); +} + +static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + trace_xen_mmu_pte_clear(mm, addr, ptep); + if (!xen_batched_set_pte(ptep, native_make_pte(0))) + native_pte_clear(mm, addr, ptep); +} + +static void xen_pmd_clear(pmd_t *pmdp) +{ + trace_xen_mmu_pmd_clear(pmdp); + set_pmd(pmdp, __pmd(0)); +} +#endif /* CONFIG_X86_PAE */ + +__visible pmd_t xen_make_pmd(pmdval_t pmd) +{ + pmd = pte_pfn_to_mfn(pmd); + return native_make_pmd(pmd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); + +#if CONFIG_PGTABLE_LEVELS == 4 +__visible pudval_t xen_pud_val(pud_t pud) +{ + return pte_mfn_to_pfn(pud.pud); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); + +__visible pud_t xen_make_pud(pudval_t pud) +{ + pud = pte_pfn_to_mfn(pud); + + return native_make_pud(pud); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); + +static pgd_t *xen_get_user_pgd(pgd_t *pgd) +{ + pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); + unsigned offset = pgd - pgd_page; + pgd_t *user_ptr = NULL; + + if (offset < pgd_index(USER_LIMIT)) { + struct page *page = virt_to_page(pgd_page); + user_ptr = (pgd_t *)page->private; + if (user_ptr) + user_ptr += offset; + } + + return user_ptr; +} + +static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = p4d_val_ma(val); + xen_extend_mmu_update(&u); +} + +/* + * Raw hypercall-based set_p4d, intended for in early boot before + * there's a page structure. This implies: + * 1. The only existing pagetable is the kernel's + * 2. It is always pinned + * 3. 
It has no user pagetable attached to it + */ +static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) +{ + preempt_disable(); + + xen_mc_batch(); + + __xen_set_p4d_hyper(ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_set_p4d(p4d_t *ptr, p4d_t val) +{ + pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); + pgd_t pgd_val; + + trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + if (user_ptr) { + WARN_ON(xen_page_pinned(user_ptr)); + pgd_val.pgd = p4d_val_ma(val); + *user_ptr = pgd_val; + } + return; + } + + /* If it's pinned, then we can at least batch the kernel and + user updates together. */ + xen_mc_batch(); + + __xen_set_p4d_hyper(ptr, val); + if (user_ptr) + __xen_set_p4d_hyper((p4d_t *)user_ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ + +static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; + for (i = 0; i < nr; i++) { + if (!pmd_none(pmd[i])) + flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); + } + return flush; +} + +static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; + for (i = 0; i < nr; i++) { + pmd_t *pmd; + + if (pud_none(pud[i])) + continue; + + pmd = pmd_offset(&pud[i], 0); + if (PTRS_PER_PMD > 1) + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); + flush |= xen_pmd_walk(mm, pmd, func, + last && i == nr - 1, limit); + } + return flush; +} + +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; + for (i = 0; i < nr; i++) { + pud_t *pud; + + if (p4d_none(p4d[i])) + continue; + + pud = pud_offset(&p4d[i], 0); + if (PTRS_PER_PUD > 1) + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); + flush |= xen_pud_walk(mm, pud, func, + last && i == nr - 1, limit); + } + return flush; +} + +/* + * (Yet another) pagetable walker. This one is intended for pinning a + * pagetable. This means that it walks a pagetable and calls the + * callback function on each page it finds making up the page table, + * at every level. It walks the entire pagetable, but it only bothers + * pinning pte pages which are below limit. In the normal case this + * will be STACK_TOP_MAX, but at boot we need to pin up to + * FIXADDR_TOP. + * + * For 32-bit the important bit is that we don't pin beyond there, + * because then we start getting into Xen's ptes. + * + * For 64-bit, we must skip the Xen hole in the middle of the address + * space, just after the big x86-64 virtual hole. + */ +static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) +{ + int i, nr, flush = 0; + unsigned hole_low, hole_high; + + /* The limit is the last byte to be touched */ + limit--; + BUG_ON(limit >= FIXADDR_TOP); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + /* + * 64-bit has a great big hole in the middle of the address + * space, which contains the Xen mappings. 
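The top-level loop that follows can be previewed with a standalone model: visit every pgd slot up to the one covering the limit, but skip the slots belonging to the hypervisor hole. The address-space constants below are illustrative stand-ins (classic pre-KASLR defaults), not values read from a running kernel:

#include <stdio.h>

#define PTRS_PER_PGD 512
#define PGDIR_SHIFT  39
#define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

int main(void)
{
    unsigned long user_limit  = 0x00007fffffffffffUL;   /* stand-in for USER_LIMIT */
    unsigned long page_offset = 0xffff880000000000UL;   /* stand-in for PAGE_OFFSET */
    unsigned long limit       = 0xffffffffff7ff000UL;   /* stand-in for FIXADDR_TOP */

    unsigned hole_low  = pgd_index(user_limit);
    unsigned hole_high = pgd_index(page_offset);
    unsigned nr = pgd_index(limit - 1) + 1, i, visited = 0;

    for (i = 0; i < nr; i++) {
        if (i >= hole_low && i < hole_high)
            continue;                        /* Xen-owned region: never touch */
        visited++;                           /* real code recurses into p4d/pud/pmd here */
    }
    printf("visited %u of %u pgd slots (slots %u..%u skipped)\n",
           visited, nr, hole_low, hole_high);
    return 0;
}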
On 32-bit these + * will end up making a zero-sized hole and so is a no-op. + */ + hole_low = pgd_index(USER_LIMIT); + hole_high = pgd_index(PAGE_OFFSET); + + nr = pgd_index(limit) + 1; + for (i = 0; i < nr; i++) { + p4d_t *p4d; + + if (i >= hole_low && i < hole_high) + continue; + + if (pgd_none(pgd[i])) + continue; + + p4d = p4d_offset(&pgd[i], 0); + if (PTRS_PER_P4D > 1) + flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); + flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); + } + + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); + + return flush; +} + +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) +{ + return __xen_pgd_walk(mm, mm->pgd, func, limit); +} + +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. */ +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) +{ + spinlock_t *ptl = NULL; + +#if USE_SPLIT_PTE_PTLOCKS + ptl = ptlock_ptr(page); + spin_lock_nest_lock(ptl, &mm->page_table_lock); +#endif + + return ptl; +} + +static void xen_pte_unlock(void *v) +{ + spinlock_t *ptl = v; + spin_unlock(ptl); +} + +static void xen_do_pin(unsigned level, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = level; + op.arg1.mfn = pfn_to_mfn(pfn); + + xen_extend_mmuext_op(&op); +} + +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) +{ + unsigned pgfl = TestSetPagePinned(page); + int flush; + + if (pgfl) + flush = 0; /* already pinned */ + else if (PageHighMem(page)) + /* kmaps need flushing if we found an unpinned + highpage */ + flush = 1; + else { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl; + + flush = 0; + + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ + ptl = NULL; + if (level == PT_PTE) + ptl = xen_pte_lock(page, mm); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL_RO), + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (ptl) { + xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); + + /* Queue a deferred unlock for when this batch + is completed. */ + xen_mc_callback(xen_pte_unlock, ptl); + } + } + + return flush; +} + +/* This is called just after a mm has been created, but it has not + been used yet. We need to make sure that its pagetable is all + read-only, and can be pinned. 
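The deferred-unlock idea used by xen_pin_page() above (take a per-pte-page lock, queue the RO remap and pin into the batch, and register a callback so the lock is only dropped once the batch has actually been flushed) can be modelled without any hypervisor. The batch structure and the "locks" below are stand-ins for the multicall queue and spinlocks:

#include <stdio.h>

#define MAX_CB 32

struct batch {
    void (*cb[MAX_CB])(void *);
    void *arg[MAX_CB];
    int ncb;
};

static void batch_callback(struct batch *b, void (*fn)(void *), void *arg)
{
    b->cb[b->ncb] = fn;
    b->arg[b->ncb] = arg;
    b->ncb++;
}

static void batch_issue(struct batch *b)
{
    int i;

    /* real code flushes the queued hypercalls first, then runs the callbacks */
    for (i = 0; i < b->ncb; i++)
        b->cb[i](b->arg[i]);
    b->ncb = 0;
}

static void unlock_pte(void *lock) { printf("dropping pte page lock %p\n", lock); }

int main(void)
{
    struct batch b = { .ncb = 0 };
    int lock_a, lock_b;                      /* stand-ins for spinlock_t */

    /* "pin" two pte pages: the RO remap and PIN op would be queued here */
    batch_callback(&b, unlock_pte, &lock_a);
    batch_callback(&b, unlock_pte, &lock_b);

    batch_issue(&b);                         /* locks dropped only after the batch completes */
    return 0;
}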
*/ +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) +{ + trace_xen_mmu_pgd_pin(mm, pgd); + + xen_mc_batch(); + + if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { + /* re-enable interrupts for flushing */ + xen_mc_issue(0); + + kmap_flush_unused(); + + xen_mc_batch(); + } + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); + + if (user_pgd) { + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); + } + } +#else /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is pinnable */ + xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), + PT_PMD); +#endif + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); +#endif /* CONFIG_X86_64 */ + xen_mc_issue(0); +} + +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + +/* + * On save, we need to pin all pagetables to make sure they get their + * mfns turned into pfns. Search the list for any unpinned pgds and pin + * them (unpinned pgds are not currently in use, probably because the + * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. + */ +void xen_mm_pin_all(void) +{ + struct page *page; + + spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); + SetPageSavePinned(page); + } + } + + spin_unlock(&pgd_lock); +} + +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now. + */ +static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) +{ + SetPagePinned(page); + return 0; +} + +static void __init xen_mark_init_mm_pinned(void) +{ + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); +} + +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) +{ + unsigned pgfl = TestClearPagePinned(page); + + if (pgfl && !PageHighMem(page)) { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + spinlock_t *ptl = NULL; + struct multicall_space mcs; + + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ + if (level == PT_PTE) { + ptl = xen_pte_lock(page, mm); + + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + } + + mcs = __xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL), + level == PT_PGD ? 
UVMF_TLB_FLUSH : 0); + + if (ptl) { + /* unlock when batch completed */ + xen_mc_callback(xen_pte_unlock, ptl); + } + } + + return 0; /* never need to flush on unpin */ +} + +/* Release a pagetables pages back as normal RW */ +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) +{ + trace_xen_mmu_pgd_unpin(mm, pgd); + + xen_mc_batch(); + + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) { + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); + } + } +#endif + +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is unpinned */ + xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), + PT_PMD); +#endif + + __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); + + xen_mc_issue(0); +} + +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + +/* + * On resume, undo any pinning done at save, so that the rest of the + * kernel doesn't see any unexpected pinned pagetables. + */ +void xen_mm_unpin_all(void) +{ + struct page *page; + + spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { + BUG_ON(!PagePinned(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); + ClearPageSavePinned(page); + } + } + + spin_unlock(&pgd_lock); +} + +static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + spin_lock(&next->page_table_lock); + xen_pgd_pin(next); + spin_unlock(&next->page_table_lock); +} + +static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + xen_pgd_pin(mm); + spin_unlock(&mm->page_table_lock); +} + + +#ifdef CONFIG_SMP +/* Another cpu may still have their %cr3 pointing at the pagetable, so + we need to repoint it somewhere else before we can unpin it. */ +static void drop_other_mm_ref(void *info) +{ + struct mm_struct *mm = info; + struct mm_struct *active_mm; + + active_mm = this_cpu_read(cpu_tlbstate.active_mm); + + if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) + leave_mm(smp_processor_id()); + + /* If this cpu still has a stale cr3 reference, then make sure + it has been flushed. */ + if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) + load_cr3(swapper_pg_dir); +} + +static void xen_drop_mm_ref(struct mm_struct *mm) +{ + cpumask_var_t mask; + unsigned cpu; + + if (current->active_mm == mm) { + if (current->mm == mm) + load_cr3(swapper_pg_dir); + else + leave_mm(smp_processor_id()); + } + + /* Get the "official" set of cpus referring to our pagetable. */ + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, mm_cpumask(mm)); + + /* It's possible that a vcpu may have a stale reference to our + cr3, because its in lazy mode, and it hasn't yet flushed + its set of pending hypercalls yet. In this case, we can + look at its actual current cr3 value, and force it to flush + if needed. 
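The scan that follows (the tail of xen_drop_mm_ref()) simply adds to the IPI mask any CPU whose last-loaded cr3 still points at this pagetable, on top of the CPUs already in mm_cpumask. A standalone model of that check, with invented per-cpu values:

#include <stdio.h>

#define NR_CPUS 4

static unsigned long xen_current_cr3[NR_CPUS] = { 0x1000, 0x2000, 0x1000, 0x3000 };

int main(void)
{
    unsigned long mm_pgd_pa = 0x1000;        /* stand-in for __pa(mm->pgd) */
    unsigned long mask = 0x0;                /* would start from mm_cpumask; empty here */
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (xen_current_cr3[cpu] == mm_pgd_pa)
            mask |= 1UL << cpu;              /* lazy vcpu still referencing our cr3 */

    printf("would IPI the cpus in mask %#lx to drop the pagetable\n", mask);
    return 0;
}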
*/ + for_each_online_cpu(cpu) { + if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) + cpumask_set_cpu(cpu, mask); + } + + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); +} +#else +static void xen_drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) + load_cr3(swapper_pg_dir); +} +#endif + +/* + * While a process runs, Xen pins its pagetables, which means that the + * hypervisor forces it to be read-only, and it controls all updates + * to it. This means that all pagetable updates have to go via the + * hypervisor, which is moderately expensive. + * + * Since we're pulling the pagetable down, we switch to use init_mm, + * unpin old process pagetable and mark it all read-write, which + * allows further operations on it to be simple memory accesses. + * + * The only subtle point is that another CPU may be still using the + * pagetable because of lazy tlb flushing. This means we need need to + * switch all CPUs off this pagetable before we can unpin it. + */ +static void xen_exit_mmap(struct mm_struct *mm) +{ + get_cpu(); /* make sure we don't move around */ + xen_drop_mm_ref(mm); + put_cpu(); + + spin_lock(&mm->page_table_lock); + + /* pgd may not be pinned in the error exit path of execve */ + if (xen_page_pinned(mm->pgd)) + xen_pgd_unpin(mm); + + spin_unlock(&mm->page_table_lock); +} + +static void xen_post_allocator_init(void); + +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, + unsigned long vaddr_end) +{ + unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); + + /* NOTE: The loop is more greedy than the cleanup_highmap variant. + * We include the PMD passed in on _both_ boundaries. */ + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); + pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > kernel_end) + set_pmd(pmd, __pmd(0)); + } + /* In case we did something silly, we should crash in this function + * instead of somewhere later and be confusing. */ + xen_mc_flush(); +} + +/* + * Make a page range writeable and free it. 
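The range test inside xen_cleanhighmap() above clears every 2M slot of the kernel-image alias that does not cover the kernel itself. A rough standalone rendition, with an invented layout:

#include <stdio.h>

#define PMD_SIZE (2UL << 20)

int main(void)
{
    unsigned long text_start = 0xffffffff81000000UL;    /* pretend _text */
    unsigned long kernel_end = 0xffffffff83000000UL;    /* pretend rounded _brk_end */
    unsigned long base = 0xffffffff80000000UL, top = 0xffffffff88000000UL;
    unsigned long vaddr, cleared = 0;

    for (vaddr = base; vaddr < top; vaddr += PMD_SIZE) {
        if (vaddr < text_start || vaddr > kernel_end)
            cleared++;                       /* real code: set_pmd(pmd, __pmd(0)) */
    }
    printf("cleared %lu of %lu 2M mappings\n", cleared, (top - base) / PMD_SIZE);
    return 0;
}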
+ */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) +{ + unsigned long pa; + pte_t *pte_tbl; + int i; + + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + return; + } + + pte_tbl = pte_offset_kernel(pmd, 0); + for (i = 0; i < PTRS_PER_PTE; i++) { + if (pte_none(pte_tbl[i])) + continue; + pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + set_pmd(pmd, __pmd(0)); + xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); +} + +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) +{ + unsigned long pa; + pmd_t *pmd_tbl; + int i; + + if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + return; + } + + pmd_tbl = pmd_offset(pud, 0); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd_none(pmd_tbl[i])) + continue; + xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); + } + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); +} + +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) +{ + unsigned long pa; + pud_t *pud_tbl; + int i; + + if (p4d_large(*p4d)) { + pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, P4D_SIZE); + return; + } + + pud_tbl = pud_offset(p4d, 0); + for (i = 0; i < PTRS_PER_PUD; i++) { + if (pud_none(pud_tbl[i])) + continue; + xen_cleanmfnmap_pud(pud_tbl + i, unpin); + } + set_p4d(p4d, __p4d(0)); + xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. + */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + pgd_t *pgd; + p4d_t *p4d; + unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + vaddr &= PMD_MASK; + pgd = pgd_offset_k(vaddr); + p4d = p4d_offset(pgd, 0); + for (i = 0; i < PTRS_PER_P4D; i++) { + if (p4d_none(p4d[i])) + continue; + xen_cleanmfnmap_p4d(p4d + i, unpin); + } + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(0)); + xen_cleanmfnmap_free_pgtbl(p4d, unpin); + } +} + +static void __init xen_pagetable_p2m_free(void) +{ + unsigned long size; + unsigned long addr; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* No memory or already called. */ + if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) + return; + + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + addr = xen_start_info->mfn_list; + /* + * We could be in __ka space. + * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. 
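The sizing and placement decision that xen_pagetable_p2m_free() makes next (round the p2m list up to a PMD boundary, then either clean the __ka alias or tear down a separate mapping depending on where the list lives) looks like this in a standalone model; the guest size and addresses are invented:

#include <stdio.h>

#define PMD_SIZE            (2UL << 20)                  /* 2 MiB */
#define START_KERNEL_MAP    0xffffffff80000000UL

static unsigned long roundup_pmd(unsigned long x)
{
    return (x + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
}

int main(void)
{
    unsigned long nr_pages = 1UL << 20;                  /* 4 GiB guest: 1M pfns */
    unsigned long size = roundup_pmd(nr_pages * sizeof(unsigned long));
    unsigned long mfn_list = 0xffffffff84000000UL;       /* pretend __ka address */

    printf("p2m list occupies %lu MiB after the PMD round-up\n", size >> 20);
    if (mfn_list >= START_KERNEL_MAP)
        printf("mapped in __ka: clean the highmap alias and free the memblock\n");
    else
        printf("mapped elsewhere: walk and free its private page tables\n");
    return 0;
}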
+ */ + size = roundup(size, PMD_SIZE); + + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; + + /* At this stage, cleanup_highmap has already cleaned __ka space + * from _brk_limit way up to the max_pfn_mapped (which is the end of + * the ramdisk). We continue on, erasing PMD entries that point to page + * tables - do note that they are accessible at this stage via __va. + * For good measure we also round up to the PMD - which means that if + * anybody is using __ka address to the initial boot-stack - and try + * to use it - they are going to crash. The xen_start_info has been + * taken care of already in xen_setup_kernel_pagetable. */ + addr = xen_start_info->pt_base; + size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + + xen_cleanhighmap(addr, addr + size); + xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG + /* This is superfluous and is not necessary, but you know what + * lets do it. The MODULES_VADDR -> MODULES_END should be clear of + * anything at this stage. */ + xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif +} +#endif + +static void __init xen_pagetable_p2m_setup(void) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + xen_vmalloc_p2m_tree(); + +#ifdef CONFIG_X86_64 + xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); +#endif + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; +} + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_post_allocator_init(); + + xen_pagetable_p2m_setup(); + + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + + /* Remap memory freed due to conflicts with E820 map */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_remap_memory(); + + xen_setup_shared_info(); +} +static void xen_write_cr2(unsigned long cr2) +{ + this_cpu_read(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return this_cpu_read(xen_vcpu)->arch.cr2; +} + +unsigned long xen_read_cr2_direct(void) +{ + return this_cpu_read(xen_vcpu_info.arch.cr2); +} + +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb(0); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_single(addr); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + struct { + struct mmuext_op op; +#ifdef CONFIG_SMP + DECLARE_BITMAP(mask, num_processors); +#else + DECLARE_BITMAP(mask, NR_CPUS); +#endif + } *args; + struct multicall_space mcs; + + 
trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); + + if (cpumask_empty(cpus)) + return; /* nothing to do */ + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->op.arg2.vcpumask = to_cpumask(args->mask); + + /* Remove us, and any offline CPUS. */ + cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); + + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = start; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static unsigned long xen_read_cr3(void) +{ + return this_cpu_read(xen_cr3); +} + +static void set_current_cr3(void *v) +{ + this_cpu_write(xen_current_cr3, (unsigned long)v); +} + +static void __xen_write_cr3(bool kernel, unsigned long cr3) +{ + struct mmuext_op op; + unsigned long mfn; + + trace_xen_mmu_write_cr3(kernel, cr3); + + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; + + WARN_ON(mfn == 0 && kernel); + + op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; + op.arg1.mfn = mfn; + + xen_extend_mmuext_op(&op); + + if (kernel) { + this_cpu_write(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + this_cpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} + +#ifdef CONFIG_X86_64 +/* + * At the start of the day - when Xen launches a guest, it has already + * built pagetables for the guest. We diligently look over them + * in xen_setup_kernel_pagetable and graft as appropriate them in the + * init_level4_pgt and its friends. Then when we are happy we load + * the new init_level4_pgt - and continue on. + * + * The generic code starts (start_kernel) and 'init_mem_mapping' sets + * up the rest of the pagetables. When it has completed it loads the cr3. + * N.B. that baremetal would start at 'start_kernel' (and the early + * #PF handler would create bootstrap pagetables) - so we are running + * with the same assumptions as what to do when write_cr3 is executed + * at this point. + * + * Since there are no user-page tables at all, we have two variants + * of xen_write_cr3 - the early bootup (this one), and the late one + * (xen_write_cr3). The reason we have to do that is that in 64-bit + * the Linux kernel and user-space are both in ring 3 while the + * hypervisor is in ring 0. 
+ */ +static void __init xen_write_cr3_init(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + this_cpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} +#endif + +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { +#ifdef CONFIG_X86_VSYSCALL_EMULATION + user_pgd[pgd_index(VSYSCALL_ADDR)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); +#endif + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ +__visible pte_t xen_make_pte_init(pteval_t pte) +{ +#ifdef CONFIG_X86_64 + unsigned long pfn; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; + if (xen_start_info->mfn_list < __START_KERNEL_map && + pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte &= ~_PAGE_RW; +#endif + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); + +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) +{ +#ifdef CONFIG_X86_32 + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_mfn(pte) != INVALID_P2M_ENTRY + && pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); +#endif + native_set_pte(ptep, pte); +} + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. 
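The read-only enforcement in xen_make_pte_init() above is a simple window check: if the pfn falls inside the hypervisor-supplied p2m frame range, _PAGE_RW is stripped. A toy version with simplified masks (the real helper also converts the pfn to an mfn, which is omitted here):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT    12
#define PTE_PFN_MASK  0x000ffffffffff000ULL
#define _PAGE_RW      0x002ULL

static uint64_t make_pte_init(uint64_t pte, uint64_t first_p2m_pfn, uint64_t nr_p2m_frames)
{
    uint64_t pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;

    if (pfn >= first_p2m_pfn && pfn < first_p2m_pfn + nr_p2m_frames)
        pte &= ~_PAGE_RW;                    /* page backs the p2m list: force RO */
    return pte;
}

int main(void)
{
    uint64_t pte = (0x1234ULL << PAGE_SHIFT) | _PAGE_RW | 0x1 /* present */;

    printf("inside p2m window:  %#llx\n",
           (unsigned long long)make_pte_init(pte, 0x1000, 0x1000));
    printf("outside p2m window: %#llx\n",
           (unsigned long long)make_pte_init(pte, 0x10000, 0x100));
    return 0;
}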
*/ +static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); +} + +/* Used for pmd and pud */ +static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* Early release_pte assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void __init xen_release_pte_init(unsigned long pfn) +{ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void __init xen_release_pmd_init(unsigned long pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct multicall_space mcs; + struct mmuext_op *op; + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = cmd; + op->arg1.mfn = pfn_to_mfn(pfn); + + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); +} + +static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + struct multicall_space mcs; + unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, + pfn_pte(pfn, prot), 0); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, + unsigned level) +{ + bool pinned = PagePinned(virt_to_page(mm->pgd)); + + trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); + + if (pinned) { + struct page *page = pfn_to_page(pfn); + + SetPagePinned(page); + + if (!PageHighMem(page)) { + xen_mc_batch(); + + __set_pfn_prot(pfn, PAGE_KERNEL_RO); + + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } else { + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } + } +} + +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PTE); +} + +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PMD); +} + +/* This should never happen until we're OK to use struct page */ +static inline void xen_release_ptpage(unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + bool pinned = PagePinned(page); + + trace_xen_mmu_release_ptpage(pfn, level, pinned); + + if (pinned) { + if (!PageHighMem(page)) { + xen_mc_batch(); + + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + + __set_pfn_prot(pfn, PAGE_KERNEL); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } + ClearPagePinned(page); + } +} + +static void xen_release_pte(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pmd(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + +#if CONFIG_PGTABLE_LEVELS >= 4 +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + +void __init xen_reserve_top(void) +{ +#ifdef CONFIG_X86_32 + unsigned long top = HYPERVISOR_VIRT_START; + struct 
xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void * __init __ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long __init m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_PFN_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void * __init m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +/* Set the page permissions on an identity-mapped pages */ +static void __init set_page_prot_flags(void *addr, pgprot_t prot, + unsigned long flags) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) + BUG(); +} +static void __init set_page_prot(void *addr, pgprot_t prot) +{ + return set_page_prot_flags(addr, prot, UVMF_NONE); +} +#ifdef CONFIG_X86_32 +static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + + ident_pte = 0; + pfn = 0; + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == LEVEL1_IDENT_ENTRIES) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} +#endif +void __init xen_setup_machphys_mapping(void) +{ + struct xen_machphys_mapping mapping; + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr = mapping.max_mfn + 1; + } else { + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; + } +#ifdef CONFIG_X86_32 + WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) + < machine_to_phys_mapping); +#endif +} + +#ifdef CONFIG_X86_64 +static void __init convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. 
*/ + for (i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, + unsigned long addr) +{ + if (*pt_base == PFN_DOWN(__pa(addr))) { + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); + clear_page((void *)addr); + (*pt_base)++; + } + if (*pt_end == PFN_DOWN(__pa(addr))) { + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); + clear_page((void *)addr); + (*pt_end)--; + } +} +/* + * Set up the initial kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, and level2_kernel_pgt. This means that only the + * kernel has a physical mapping to start with - but that's enough to + * get __va working. We need to fill in the rest of the physical + * mapping once some sort of allocator has been set up. + */ +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + unsigned long addr[3]; + unsigned long pt_base, pt_end; + unsigned i; + + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. */ + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + + pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); + pt_end = pt_base + xen_start_info->nr_pt_frames; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ + convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ + convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_k[511] -> level2_fixmap_pgt */ + convert_pfn_mfn(level3_kernel_pgt); + + /* L3_k[511][506] -> level1_fixmap_pgt */ + convert_pfn_mfn(level2_fixmap_pgt); + } + /* We get [511][511] and have Xen's version of level2_kernel_pgt */ + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + addr[0] = (unsigned long)pgd; + addr[1] = (unsigned long)l3; + addr[2] = (unsigned long)l2; + /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: + * Both L4[272][0] and L4[511][510] have entries that point to the same + * L2 (PMD) tables. Meaning that if you modify it in __va space + * it will be also modified in the __ka space! (But if you just + * modify the PMD table to point to other PTE's or none, then you + * are OK - which is what cleanup_highmap does) */ + copy_page(level2_ident_pgt, l2); + /* Graft it onto L4[511][510] */ + copy_page(level2_kernel_pgt, l2); + + /* Copy the initial P->M table mappings if necessary. 
*/ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } else + native_write_cr3(__pa(init_level4_pgt)); + + /* We can't that easily rip out L3 and L2, as the Xen pagetables are + * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for + * the initial domain. For guests using the toolstack, they are in: + * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only + * rip out the [L4] (pgd), but for guests we shave off three pages. + */ + for (i = 0; i < ARRAY_SIZE(addr); i++) + check_pt_base(&pt_base, &pt_end, addr[i]); + + /* Our (by three pages) smaller Xen pagetable that we are using */ + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); + + /* Revector the xen_start_info */ + xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); +} + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. + */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. 
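The manual walk in xen_early_virt_to_phys() above reads one table entry per level straight from physical memory and stops early on 1G/2M large pages. The index arithmetic driving it (each level consumes nine bits of the virtual address on x86-64 with 4K pages) is easy to show on its own; the sample address is arbitrary:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PTE_BITS   9

static unsigned idx(unsigned long vaddr, int level)     /* level 0 = PTE, 3 = PGD */
{
    return (vaddr >> (PAGE_SHIFT + level * PTE_BITS)) & ((1 << PTE_BITS) - 1);
}

int main(void)
{
    unsigned long vaddr = 0xffffffff81000000UL;          /* typical kernel text address */

    printf("pgd %u, pud %u, pmd %u, pte %u, offset %#lx\n",
           idx(vaddr, 3), idx(vaddr, 2), idx(vaddr, 1), idx(vaddr, 0),
           vaddr & ((1UL << PAGE_SHIFT) - 1));
    return 0;
}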
+ */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + p4d_t *p4d = NULL; + pgd_t *pgd; + unsigned long *new_p2m; + int save_pud; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; + if (PTRS_PER_P4D > 1) + n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + else + n_p4d = 0; + n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. + * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. + */ + p4d_phys = new_area; + pud_phys = p4d_phys + PFN_PHYS(n_p4d); + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + idx_p4d = 0; + save_pud = n_pud; + do { + if (n_p4d > 0) { + p4d = early_memremap(p4d_phys, PAGE_SIZE); + clear_page(p4d); + n_pud = min(save_pud, PTRS_PER_P4D); + } + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + if (n_p4d > 0) + set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); + else + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + if (n_p4d > 0) { + save_pud -= PTRS_PER_P4D; + early_memunmap(p4d, PAGE_SIZE); + make_lowmem_page_readonly(__va(p4d_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); + set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); + p4d_phys += PAGE_SIZE; + } + } while (++idx_p4d < n_p4d); + + /* Now copy the old p2m info to the new area. 
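The frame bookkeeping at the top of xen_relocate_p2m() above is a per-level round-up: one 8-byte p2m entry per guest page, then one page-table page per 2M, 1G and 512G slice of the resulting list. A back-of-the-envelope version for an invented guest size:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define P4D_SHIFT  39

static unsigned long div_round_up(unsigned long x, unsigned shift)
{
    return (x + (1UL << shift) - 1) >> shift;
}

int main(void)
{
    unsigned long nr_pages = 4UL << 20;                  /* 16 GiB guest */
    unsigned long size = nr_pages * sizeof(unsigned long);

    unsigned long n_pte = div_round_up(size, PAGE_SHIFT);
    unsigned long n_pt  = div_round_up(size, PMD_SHIFT);
    unsigned long n_pmd = div_round_up(size, PUD_SHIFT);
    unsigned long n_pud = div_round_up(size, P4D_SHIFT);

    printf("p2m list: %lu data frames + %lu pt + %lu pmd + %lu pud pages\n",
           n_pte, n_pt, n_pmd, n_pud);
    return 0;
}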
*/ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. */ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + +#else /* !CONFIG_X86_64 */ +static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); + +static void __init xen_write_cr3_init(unsigned long cr3) +{ + unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); + + BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(cr3 != __pa(swapper_pg_dir)); + + /* + * We are switching to swapper_pg_dir for the first time (from + * initial_page_table) and therefore need to mark that page + * read-only and then pin it. + * + * Xen disallows sharing of kernel PMDs for PAE + * guests. Therefore we must copy the kernel PMD from + * initial_page_table into a new kernel PMD to be used in + * swapper_pg_dir. + */ + swapper_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + copy_page(swapper_kernel_pmd, initial_kernel_pmd); + swapper_pg_dir[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); + set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); + + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + xen_write_cr3(cr3); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(initial_page_table))); + set_page_prot(initial_page_table, PAGE_KERNEL); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL); + + pv_mmu_ops.write_cr3 = &xen_write_cr3; +} + +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. 
+ */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + + initial_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); + + copy_page(initial_kernel_pmd, kernel_pmd); + + xen_map_identity_early(initial_kernel_pmd, max_pfn); + + copy_page(initial_page_table, pgd); + initial_page_table[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); + + set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); + set_page_prot(initial_page_table, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa(initial_page_table))); + xen_write_cr3(__pa(initial_page_table)); + + memblock_reserve(xen_pt_base, xen_pt_size); +} +#endif /* CONFIG_X86_64 */ + +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + +static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; + +static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: + case FIX_RO_IDT: +#ifdef CONFIG_X86_32 + case FIX_WP_TEST: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#elif defined(CONFIG_X86_VSYSCALL_EMULATION) + case VSYSCALL_PAGE: +#endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: + case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: + /* All local page mappings */ + pte = pfn_pte(phys, prot); + break; + +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + +#ifdef CONFIG_X86_IO_APIC + case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: + /* + * We just don't map the IO APIC - all access is via + * hypercalls. Keep the address in the pte for reference. + */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + + case FIX_PARAVIRT_BOOTMAP: + /* This is an MFN, but it isn't an IO mapping from the + IO domain */ + pte = mfn_pte(phys, prot); + break; + + default: + /* By default, set_fixmap is used for hardware mappings */ + pte = mfn_pte(phys, prot); + break; + } + + __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_VSYSCALL_EMULATION + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. 
*/ + if (idx == VSYSCALL_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif +} + +static void __init xen_post_allocator_init(void) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +#if CONFIG_PGTABLE_LEVELS >= 4 + pv_mmu_ops.set_p4d = xen_set_p4d; +#endif + + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if CONFIG_PGTABLE_LEVELS >= 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); + +#ifdef CONFIG_X86_64 + pv_mmu_ops.write_cr3 = &xen_write_cr3; + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif + xen_mark_init_mm_pinned(); +} + +static void xen_leave_lazy_mmu(void) +{ + preempt_disable(); + xen_mc_flush(); + paravirt_leave_lazy_mmu(); + preempt_enable(); +} + +static const struct pv_mmu_ops xen_mmu_ops __initconst = { + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3_init, + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, + + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pmd_init, + .release_pmd = xen_release_pmd_init, + + .set_pte = xen_set_pte_init, + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd_hyper, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), + + .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, + + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + +#if CONFIG_PGTABLE_LEVELS >= 4 + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, + + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, + }, + + .set_fixmap = xen_set_fixmap, +}; + +void __init xen_init_mmu_ops(void) +{ + x86_init.paging.pagetable_init = xen_pagetable_init; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + pv_mmu_ops = xen_mmu_ops; + + memset(dummy_mapping, 0xff, PAGE_SIZE); +} + +/* Protected by xen_reservation_lock. 
*/ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; + +#define VOID_PTE (mfn_pte(0, __pgprot(0))) +static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, + unsigned long *in_frames, + unsigned long *out_frames) +{ + int i; + struct multicall_space mcs; + + xen_mc_batch(); + for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { + mcs = __xen_mc_entry(0); + + if (in_frames) + in_frames[i] = virt_to_mfn(vaddr); + + MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); + __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + + if (out_frames) + out_frames[i] = virt_to_pfn(vaddr); + } + xen_mc_issue(0); +} + +/* + * Update the pfn-to-mfn mappings for a virtual address range, either to + * point to an array of mfns, or contiguously from a single starting + * mfn. + */ +static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, + unsigned long *mfns, + unsigned long first_mfn) +{ + unsigned i, limit; + unsigned long mfn; + + xen_mc_batch(); + + limit = 1u << order; + for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { + struct multicall_space mcs; + unsigned flags; + + mcs = __xen_mc_entry(0); + if (mfns) + mfn = mfns[i]; + else + mfn = first_mfn + i; + + if (i < (limit - 1)) + flags = 0; + else { + if (order == 0) + flags = UVMF_INVLPG | UVMF_ALL; + else + flags = UVMF_TLB_FLUSH | UVMF_ALL; + } + + MULTI_update_va_mapping(mcs.mc, vaddr, + mfn_pte(mfn, PAGE_KERNEL), flags); + + set_phys_to_machine(virt_to_pfn(vaddr), mfn); + } + + xen_mc_issue(0); +} + +/* + * Perform the hypercall to exchange a region of our pfns to point to + * memory with the required contiguous alignment. Takes the pfns as + * input, and populates mfns as output. + * + * Returns a success code indicating whether the hypervisor was able to + * satisfy the request or not. + */ +static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, + unsigned long *pfns_in, + unsigned long extents_out, + unsigned int order_out, + unsigned long *mfns_out, + unsigned int address_bits) +{ + long rc; + int success; + + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = extents_in, + .extent_order = order_in, + .extent_start = pfns_in, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = extents_out, + .extent_order = order_out, + .extent_start = mfns_out, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + BUG_ON(extents_in << order_in != extents_out << order_out); + + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == extents_in); + + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); + + return success; +} + +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle) +{ + unsigned long *in_frames = discontig_frames, out_frame; + unsigned long flags; + int success; + unsigned long vstart = (unsigned long)phys_to_virt(pstart); + + /* + * Currently an auto-translated guest will not perform I/O, nor will + * it require PAE page directories below 4GB. Therefore any calls to + * this function are redundant and can be ignored. + */ + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Zap current PTEs, remembering MFNs. 
*/ + xen_zap_pfn_range(vstart, order, in_frames, NULL); + + /* 2. Get a new contiguous memory extent. */ + out_frame = virt_to_pfn(vstart); + success = xen_exchange_memory(1UL << order, 0, in_frames, + 1, order, &out_frame, + address_bits); + + /* 3. Map the new extent in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); + else + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + *dma_handle = virt_to_machine(vstart).maddr; + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) +{ + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long flags; + int success; + unsigned long vstart; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + vstart = (unsigned long)phys_to_virt(pstart); + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = virt_to_mfn(vstart); + + /* 2. Zap current PTEs. */ + xen_zap_pfn_range(vstart, order, NULL, out_frames); + + /* 3. Do the exchange for non-contiguous MFNs. */ + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, + 0, out_frames, 0); + + /* 4. Map new pages in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); + else + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + +#ifdef CONFIG_KEXEC_CORE +phys_addr_t paddr_vmcoreinfo_note(void) +{ + if (xen_pv_domain()) + return virt_to_machine(&vmcoreinfo_note).maddr; + else + return __pa_symbol(&vmcoreinfo_note); +} +#endif /* CONFIG_KEXEC_CORE */ diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h index af5f0ad94078..4be5355b56f7 100644 --- a/arch/x86/xen/pmu.h +++ b/arch/x86/xen/pmu.h @@ -4,8 +4,13 @@ #include <xen/interface/xenpmu.h> irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +#ifdef CONFIG_XEN_HAVE_VPMU void xen_pmu_init(int cpu); void xen_pmu_finish(int cpu); +#else +static inline void xen_pmu_init(int cpu) {} +static inline void xen_pmu_finish(int cpu) {} +#endif bool is_xen_pmu(int cpu); bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a8c306cf8868..a5bf7c451435 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -14,7 +14,7 @@ #include <asm/elf.h> #include <asm/vdso.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/setup.h> #include <asm/acpi.h> #include <asm/numa.h> @@ -41,8 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; unsigned long xen_released_pages; /* E820 map used during setting up memory. */ -static struct e820entry xen_e820_map[E820_X_MAX] __initdata; -static u32 xen_e820_map_entries __initdata; +static struct e820_table xen_e820_table __initdata; /* * Buffer used to remap identity mapped pages. We only need the virtual space. 
@@ -198,15 +197,15 @@ void __init xen_inv_extra_mem(void) */ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry = xen_e820_map; + const struct e820_entry *entry = xen_e820_table.entries; unsigned int i; unsigned long done = 0; - for (i = 0; i < xen_e820_map_entries; i++, entry++) { + for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; - if (entry->type != E820_RAM) + if (entry->type != E820_TYPE_RAM) continue; e_pfn = PFN_DOWN(entry->addr + entry->size); @@ -457,7 +456,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, { phys_addr_t start = 0; unsigned long ret_val = 0; - const struct e820entry *entry = xen_e820_map; + const struct e820_entry *entry = xen_e820_table.entries; int i; /* @@ -471,13 +470,13 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, * example) the DMI tables in a reserved region that begins on * a non-page boundary. */ - for (i = 0; i < xen_e820_map_entries; i++, entry++) { + for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { + if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); - if (entry->type == E820_RAM) + if (entry->type == E820_TYPE_RAM) end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) @@ -591,28 +590,28 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, phys_addr_t end = start + size; /* Align RAM regions to page boundaries. */ - if (type == E820_RAM) { + if (type == E820_TYPE_RAM) { start = PAGE_ALIGN(start); end &= ~((phys_addr_t)PAGE_SIZE - 1); } - e820_add_region(start, end - start, type); + e820__range_add(start, end - start, type); } static void __init xen_ignore_unusable(void) { - struct e820entry *entry = xen_e820_map; + struct e820_entry *entry = xen_e820_table.entries; unsigned int i; - for (i = 0; i < xen_e820_map_entries; i++, entry++) { - if (entry->type == E820_UNUSABLE) - entry->type = E820_RAM; + for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) { + if (entry->type == E820_TYPE_UNUSABLE) + entry->type = E820_TYPE_RAM; } } bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { - struct e820entry *entry; + struct e820_entry *entry; unsigned mapcnt; phys_addr_t end; @@ -620,10 +619,10 @@ bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) return false; end = start + size; - entry = xen_e820_map; + entry = xen_e820_table.entries; - for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { - if (entry->type == E820_RAM && entry->addr <= start && + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + if (entry->type == E820_TYPE_RAM && entry->addr <= start && (entry->addr + entry->size) >= end) return false; @@ -645,10 +644,10 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) { unsigned mapcnt; phys_addr_t addr, start; - struct e820entry *entry = xen_e820_map; + struct e820_entry *entry = xen_e820_table.entries; - for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { - if (entry->type != E820_RAM || entry->size < size) + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) { + if (entry->type != E820_TYPE_RAM || entry->size < size) continue; start = entry->addr; for (addr = start; addr < start + size; addr += PAGE_SIZE) { @@ -750,8 +749,8 @@ char * 
__init xen_memory_setup(void) max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); - memmap.nr_entries = ARRAY_SIZE(xen_e820_map); - set_xen_guest_handle(memmap.buffer, xen_e820_map); + memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); + set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -760,16 +759,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - xen_e820_map[0].addr = 0ULL; - xen_e820_map[0].size = mem_end; + xen_e820_table.entries[0].addr = 0ULL; + xen_e820_table.entries[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - xen_e820_map[0].size += 8ULL << 20; - xen_e820_map[0].type = E820_RAM; + xen_e820_table.entries[0].size += 8ULL << 20; + xen_e820_table.entries[0].type = E820_TYPE_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); - xen_e820_map_entries = memmap.nr_entries; + xen_e820_table.nr_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -783,8 +782,7 @@ char * __init xen_memory_setup(void) xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), - &xen_e820_map_entries); + e820__update_table(&xen_e820_table); max_pages = xen_get_max_pages(); @@ -811,15 +809,15 @@ char * __init xen_memory_setup(void) extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); i = 0; - addr = xen_e820_map[0].addr; - size = xen_e820_map[0].size; - while (i < xen_e820_map_entries) { + addr = xen_e820_table.entries[0].addr; + size = xen_e820_table.entries[0].size; + while (i < xen_e820_table.nr_entries) { bool discard = false; chunk_size = size; - type = xen_e820_map[i].type; + type = xen_e820_table.entries[i].type; - if (type == E820_RAM) { + if (type == E820_TYPE_RAM) { if (addr < mem_end) { chunk_size = min(size, mem_end - addr); } else if (extra_pages) { @@ -840,9 +838,9 @@ char * __init xen_memory_setup(void) size -= chunk_size; if (size == 0) { i++; - if (i < xen_e820_map_entries) { - addr = xen_e820_map[i].addr; - size = xen_e820_map[i].size; + if (i < xen_e820_table.nr_entries) { + addr = xen_e820_table.entries[i].addr; + size = xen_e820_table.entries[i].size; } } } @@ -858,10 +856,9 @@ char * __init xen_memory_setup(void) * reserve ISA memory anyway because too many things poke * about in there. */ - e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, - E820_RESERVED); + e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__update_table(e820_table); /* * Check whether the kernel itself conflicts with the target E820 map. @@ -915,6 +912,37 @@ char * __init xen_memory_setup(void) } /* + * Machine specific memory setup for auto-translated guests. 
+ */ +char * __init xen_auto_xlated_memory_setup(void) +{ + struct xen_memory_map memmap; + int i; + int rc; + + memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); + set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc < 0) + panic("No memory map (%d)\n", rc); + + xen_e820_table.nr_entries = memmap.nr_entries; + + e820__update_table(&xen_e820_table); + + for (i = 0; i < xen_e820_table.nr_entries; i++) + e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type); + + /* Remove p2m info, it is not needed. */ + xen_start_info->mfn_list = 0; + xen_start_info->first_p2m_pfn = 0; + xen_start_info->nr_p2m_frames = 0; + + return "Xen"; +} + +/* * Set the bit indicating "nosegneg" library variants should be used. * We only need to bother in pure 32-bit mode; compat 32-bit processes * can have un-truncated segments, so wrapping around is allowed. @@ -999,8 +1027,8 @@ void __init xen_pvmmu_arch_setup(void) void __init xen_arch_setup(void) { xen_panic_handler_init(); - - xen_pvmmu_arch_setup(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_pvmmu_arch_setup(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7ff2f1bfb7ec..82ac611f2fc1 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -1,63 +1,21 @@ -/* - * Xen SMP support - * - * This file implements the Xen versions of smp_ops. SMP under Xen is - * very straightforward. Bringing a CPU up is simply a matter of - * loading its initial context and setting it running. - * - * IPIs are handled through the Xen event mechanism. - * - * Because virtual CPUs can be scheduled onto any real CPU, there's no - * useful topology information for the kernel to make use of. As a - * result, all CPUs are treated as if they're single-core and - * single-threaded. - */ -#include <linux/sched.h> -#include <linux/err.h> -#include <linux/slab.h> #include <linux/smp.h> -#include <linux/irq_work.h> -#include <linux/tick.h> -#include <linux/nmi.h> - -#include <asm/paravirt.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/cpu.h> - -#include <xen/interface/xen.h> -#include <xen/interface/vcpu.h> -#include <xen/interface/xenpmu.h> - -#include <asm/xen/interface.h> -#include <asm/xen/hypercall.h> +#include <linux/slab.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> -#include <xen/xen.h> -#include <xen/page.h> #include <xen/events.h> #include <xen/hvc-console.h> #include "xen-ops.h" -#include "mmu.h" #include "smp.h" -#include "pmu.h" - -cpumask_var_t xen_cpu_initialized_map; -struct xen_common_irq { - int irq; - char *name; -}; static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; -static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; -static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); -static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); /* * Reschedule call back. 
@@ -70,42 +28,6 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void cpu_bringup(void) -{ - int cpu; - - cpu_init(); - touch_softlockup_watchdog(); - preempt_disable(); - - /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ - if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { - xen_enable_sysenter(); - xen_enable_syscall(); - } - cpu = smp_processor_id(); - smp_store_cpu_info(cpu); - cpu_data(cpu).x86_max_cores = 1; - set_cpu_sibling_map(cpu); - - xen_setup_cpu_clockevents(); - - notify_cpu_starting(cpu); - - set_cpu_online(cpu, true); - - cpu_set_state_online(cpu); /* Implies full memory barrier. */ - - /* We can take interrupts now: we're officially "up". */ - local_irq_enable(); -} - -asmlinkage __visible void cpu_bringup_and_idle(void) -{ - cpu_bringup(); - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); -} - void xen_smp_intr_free(unsigned int cpu) { if (per_cpu(xen_resched_irq, cpu).irq >= 0) { @@ -133,27 +55,12 @@ void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; } - if (xen_hvm_domain()) - return; - - if (per_cpu(xen_irq_work, cpu).irq >= 0) { - unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); - per_cpu(xen_irq_work, cpu).irq = -1; - kfree(per_cpu(xen_irq_work, cpu).name); - per_cpu(xen_irq_work, cpu).name = NULL; - } +} - if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { - unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); - per_cpu(xen_pmu_irq, cpu).irq = -1; - kfree(per_cpu(xen_pmu_irq, cpu).name); - per_cpu(xen_pmu_irq, cpu).name = NULL; - } -}; int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name, *pmu_name; + char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -200,37 +107,6 @@ int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; - /* - * The IRQ worker on PVHVM goes through the native path and uses the - * IPI mechanism. 
- */ - if (xen_hvm_domain()) - return 0; - - callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); - rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, - cpu, - xen_irq_work_interrupt, - IRQF_PERCPU|IRQF_NOBALANCING, - callfunc_name, - NULL); - if (rc < 0) - goto fail; - per_cpu(xen_irq_work, cpu).irq = rc; - per_cpu(xen_irq_work, cpu).name = callfunc_name; - - if (is_xen_pmu(cpu)) { - pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); - rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, - xen_pmu_irq_handler, - IRQF_PERCPU|IRQF_NOBALANCING, - pmu_name, NULL); - if (rc < 0) - goto fail; - per_cpu(xen_pmu_irq, cpu).irq = rc; - per_cpu(xen_pmu_irq, cpu).name = pmu_name; - } - return 0; fail: @@ -238,333 +114,7 @@ int xen_smp_intr_init(unsigned int cpu) return rc; } -static void __init xen_fill_possible_map(void) -{ - int i, rc; - - if (xen_initial_domain()) - return; - - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } - } -} - -static void __init xen_filter_cpu_maps(void) -{ - int i, rc; - unsigned int subtract = 0; - - if (!xen_initial_domain()) - return; - - num_processors = 0; - disabled_cpus = 0; - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } else { - set_cpu_possible(i, false); - set_cpu_present(i, false); - subtract++; - } - } -#ifdef CONFIG_HOTPLUG_CPU - /* This is akin to using 'nr_cpus' on the Linux command line. - * Which is OK as when we use 'dom0_max_vcpus=X' we can only - * have up to X, while nr_cpu_ids is greater than X. This - * normally is not a problem, except when CPU hotplugging - * is involved and then there might be more than X CPUs - * in the guest - which will not work as there is no - * hypercall to expand the max number of VCPUs an already - * running guest has. So cap it up to X. */ - if (subtract) - nr_cpu_ids = nr_cpu_ids - subtract; -#endif - -} - -static void __init xen_smp_prepare_boot_cpu(void) -{ - BUG_ON(smp_processor_id() != 0); - native_smp_prepare_boot_cpu(); - - if (xen_pv_domain()) { - if (!xen_feature(XENFEAT_writable_page_tables)) - /* We've switched to the "real" per-cpu gdt, so make - * sure the old memory can be recycled. */ - make_lowmem_page_readwrite(xen_initial_gdt); - -#ifdef CONFIG_X86_32 - /* - * Xen starts us with XEN_FLAT_RING1_DS, but linux code - * expects __USER_DS - */ - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); -#endif - - xen_filter_cpu_maps(); - xen_setup_vcpu_info_placement(); - } - - /* - * Setup vcpu_info for boot CPU. - */ - if (xen_hvm_domain()) - xen_vcpu_setup(0); - - /* - * The alternative logic (which patches the unlock/lock) runs before - * the smp bootup up code is activated. Hence we need to set this up - * the core kernel is being patched. Otherwise we will have only - * modules patched but not core code. - */ - xen_init_spinlocks(); -} - -static void __init xen_smp_prepare_cpus(unsigned int max_cpus) -{ - unsigned cpu; - unsigned int i; - - if (skip_ioapic_setup) { - char *m = (max_cpus == 0) ? 
- "The nosmp parameter is incompatible with Xen; " \ - "use Xen dom0_max_vcpus=1 parameter" : - "The noapic parameter is incompatible with Xen"; - - xen_raw_printk(m); - panic(m); - } - xen_init_lock_cpu(0); - - smp_store_boot_cpu_info(); - cpu_data(0).x86_max_cores = 1; - - for_each_possible_cpu(i) { - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); - } - set_cpu_sibling_map(0); - - xen_pmu_init(0); - - if (xen_smp_intr_init(0)) - BUG(); - - if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) - panic("could not allocate xen_cpu_initialized_map\n"); - - cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); - - /* Restrict the possible_map according to max_cpus. */ - while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) - continue; - set_cpu_possible(cpu, false); - } - - for_each_possible_cpu(cpu) - set_cpu_present(cpu, true); -} - -static int -cpu_initialize_context(unsigned int cpu, struct task_struct *idle) -{ - struct vcpu_guest_context *ctxt; - struct desc_struct *gdt; - unsigned long gdt_mfn; - - /* used to tell cpu_init() that it can proceed with initialization */ - cpumask_set_cpu(cpu, cpu_callout_mask); - if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) - return 0; - - ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt == NULL) - return -ENOMEM; - - gdt = get_cpu_gdt_table(cpu); - -#ifdef CONFIG_X86_32 - ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = __KERNEL_STACK_CANARY; -#endif - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.ss = __KERNEL_DS; - - xen_copy_trap_info(ctxt->trap_ctxt); - - ctxt->ldt_ents = 0; - - BUG_ON((unsigned long)gdt & ~PAGE_MASK); - - gdt_mfn = arbitrary_virt_to_mfn(gdt); - make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; - - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; - -#ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_cs = __KERNEL_CS; -#else - ctxt->gs_base_kernel = per_cpu_offset(cpu); -#endif - ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = - (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) - BUG(); - - kfree(ctxt); - return 0; -} - -static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) -{ - int rc; - - common_cpu_up(cpu, idle); - - xen_setup_runstate_info(cpu); - - /* - * PV VCPUs are always successfully taken down (see 'while' loop - * in xen_cpu_die()), so -EBUSY is an error. 
- */ - rc = cpu_check_up_prepare(cpu); - if (rc) - return rc; - - /* make sure interrupts start blocked */ - per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; - - rc = cpu_initialize_context(cpu, idle); - if (rc) - return rc; - - xen_pmu_init(cpu); - - rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); - BUG_ON(rc); - - while (cpu_report_state(cpu) != CPU_ONLINE) - HYPERVISOR_sched_op(SCHEDOP_yield, NULL); - - return 0; -} - -static void xen_smp_cpus_done(unsigned int max_cpus) -{ -} - -#ifdef CONFIG_HOTPLUG_CPU -static int xen_cpu_disable(void) -{ - unsigned int cpu = smp_processor_id(); - if (cpu == 0) - return -EBUSY; - - cpu_disable_common(); - - load_cr3(swapper_pg_dir); - return 0; -} - -static void xen_cpu_die(unsigned int cpu) -{ - while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, - xen_vcpu_nr(cpu), NULL)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ/10); - } - - if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - xen_pmu_finish(cpu); - } -} - -static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ -{ - play_dead_common(); - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); - cpu_bringup(); - /* - * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) - * clears certain data that the cpu_idle loop (which called us - * and that we return from) expects. The only way to get that - * data back is to call: - */ - tick_nohz_idle_enter(); - - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); -} - -#else /* !CONFIG_HOTPLUG_CPU */ -static int xen_cpu_disable(void) -{ - return -ENOSYS; -} - -static void xen_cpu_die(unsigned int cpu) -{ - BUG(); -} - -static void xen_play_dead(void) -{ - BUG(); -} - -#endif -static void stop_self(void *v) -{ - int cpu = smp_processor_id(); - - /* make sure we're not pinning something down */ - load_cr3(swapper_pg_dir); - /* should set up a minimal gdt */ - - set_cpu_online(cpu, false); - - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL); - BUG(); -} - -static void xen_stop_other_cpus(int wait) -{ - smp_call_function(stop_self, NULL, wait); -} - -static void xen_smp_send_reschedule(int cpu) +void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } @@ -578,7 +128,7 @@ static void __xen_send_IPI_mask(const struct cpumask *mask, xen_send_IPI_one(cpu, vector); } -static void xen_smp_send_call_function_ipi(const struct cpumask *mask) +void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; @@ -593,7 +143,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) } } -static void xen_smp_send_call_function_single_ipi(int cpu) +void xen_smp_send_call_function_single_ipi(int cpu) { __xen_send_IPI_mask(cpumask_of(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); @@ -698,54 +248,3 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } - -static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) -{ - irq_enter(); - irq_work_run(); - inc_irq_stat(apic_irq_work_irqs); - irq_exit(); - - return IRQ_HANDLED; -} - -static const struct smp_ops xen_smp_ops __initconst = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .smp_cpus_done = xen_smp_cpus_done, - - .cpu_up = xen_cpu_up, - .cpu_die = xen_cpu_die, - .cpu_disable = xen_cpu_disable, - .play_dead = xen_play_dead, - - .stop_other_cpus = xen_stop_other_cpus, - .smp_send_reschedule = xen_smp_send_reschedule, - - 
.send_call_func_ipi = xen_smp_send_call_function_ipi, - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, -}; - -void __init xen_smp_init(void) -{ - smp_ops = xen_smp_ops; - xen_fill_possible_map(); -} - -static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) -{ - native_smp_prepare_cpus(max_cpus); - WARN_ON(xen_smp_intr_init(0)); - - xen_init_lock_cpu(0); -} - -void __init xen_hvm_smp_init(void) -{ - smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; - smp_ops.smp_send_reschedule = xen_smp_send_reschedule; - smp_ops.cpu_die = xen_cpu_die; - smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; - smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; - smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; -} diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 9beef333584a..8ebb6acca64a 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -11,7 +11,17 @@ extern void xen_send_IPI_self(int vector); extern int xen_smp_intr_init(unsigned int cpu); extern void xen_smp_intr_free(unsigned int cpu); +int xen_smp_intr_init_pv(unsigned int cpu); +void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_send_reschedule(int cpu); +void xen_smp_send_call_function_ipi(const struct cpumask *mask); +void xen_smp_send_call_function_single_ipi(int cpu); + +struct xen_common_irq { + int irq; + char *name; +}; #else /* CONFIG_SMP */ static inline int xen_smp_intr_init(unsigned int cpu) @@ -19,6 +29,12 @@ static inline int xen_smp_intr_init(unsigned int cpu) return 0; } static inline void xen_smp_intr_free(unsigned int cpu) {} + +static inline int xen_smp_intr_init_pv(unsigned int cpu) +{ + return 0; +} +static inline void xen_smp_intr_free_pv(unsigned int cpu) {} #endif /* CONFIG_SMP */ #endif diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c new file mode 100644 index 000000000000..f18561bbf5c9 --- /dev/null +++ b/arch/x86/xen/smp_hvm.c @@ -0,0 +1,63 @@ +#include <asm/smp.h> + +#include <xen/events.h> + +#include "xen-ops.h" +#include "smp.h" + + +static void __init xen_hvm_smp_prepare_boot_cpu(void) +{ + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + /* + * Setup vcpu_info for boot CPU. + */ + xen_vcpu_setup(0); + + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. 
+ */ + xen_init_spinlocks(); +} + +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + WARN_ON(xen_smp_intr_init(0)); + + xen_init_lock_cpu(0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void xen_hvm_cpu_die(unsigned int cpu) +{ + if (common_cpu_die(cpu) == 0) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + } +} +#else +static void xen_hvm_cpu_die(unsigned int cpu) +{ + BUG(); +} +#endif + +void __init xen_hvm_smp_init(void) +{ + if (!xen_have_vector_callback) + return; + + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; + smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; +} diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c new file mode 100644 index 000000000000..aae32535f4ec --- /dev/null +++ b/arch/x86/xen/smp_pv.c @@ -0,0 +1,490 @@ +/* + * Xen SMP support + * + * This file implements the Xen versions of smp_ops. SMP under Xen is + * very straightforward. Bringing a CPU up is simply a matter of + * loading its initial context and setting it running. + * + * IPIs are handled through the Xen event mechanism. + * + * Because virtual CPUs can be scheduled onto any real CPU, there's no + * useful topology information for the kernel to make use of. As a + * result, all CPUs are treated as if they're single-core and + * single-threaded. + */ +#include <linux/sched.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/irq_work.h> +#include <linux/tick.h> +#include <linux/nmi.h> + +#include <asm/paravirt.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/cpu.h> + +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/events.h> + +#include <xen/hvc-console.h> +#include "xen-ops.h" +#include "mmu.h" +#include "smp.h" +#include "pmu.h" + +cpumask_var_t xen_cpu_initialized_map; + +static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; + +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); + +static void cpu_bringup(void) +{ + int cpu; + + cpu_init(); + touch_softlockup_watchdog(); + preempt_disable(); + + /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { + xen_enable_sysenter(); + xen_enable_syscall(); + } + cpu = smp_processor_id(); + smp_store_cpu_info(cpu); + cpu_data(cpu).x86_max_cores = 1; + set_cpu_sibling_map(cpu); + + xen_setup_cpu_clockevents(); + + notify_cpu_starting(cpu); + + set_cpu_online(cpu, true); + + cpu_set_state_online(cpu); /* Implies full memory barrier. */ + + /* We can take interrupts now: we're officially "up". 
*/ + local_irq_enable(); +} + +asmlinkage __visible void cpu_bringup_and_idle(void) +{ + cpu_bringup(); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +void xen_smp_intr_free_pv(unsigned int cpu) +{ + if (per_cpu(xen_irq_work, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); + per_cpu(xen_irq_work, cpu).irq = -1; + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; + } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } +} + +int xen_smp_intr_init_pv(unsigned int cpu) +{ + int rc; + char *callfunc_name, *pmu_name; + + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, + cpu, + xen_irq_work_interrupt, + IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(xen_irq_work, cpu).irq = rc; + per_cpu(xen_irq_work, cpu).name = callfunc_name; + + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + + return 0; + + fail: + xen_smp_intr_free_pv(cpu); + return rc; +} + +static void __init xen_fill_possible_map(void) +{ + int i, rc; + + if (xen_initial_domain()) + return; + + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } + } +} + +static void __init xen_filter_cpu_maps(void) +{ + int i, rc; + unsigned int subtract = 0; + + if (!xen_initial_domain()) + return; + + num_processors = 0; + disabled_cpus = 0; + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } else { + set_cpu_possible(i, false); + set_cpu_present(i, false); + subtract++; + } + } +#ifdef CONFIG_HOTPLUG_CPU + /* This is akin to using 'nr_cpus' on the Linux command line. + * Which is OK as when we use 'dom0_max_vcpus=X' we can only + * have up to X, while nr_cpu_ids is greater than X. This + * normally is not a problem, except when CPU hotplugging + * is involved and then there might be more than X CPUs + * in the guest - which will not work as there is no + * hypercall to expand the max number of VCPUs an already + * running guest has. So cap it up to X. */ + if (subtract) + nr_cpu_ids = nr_cpu_ids - subtract; +#endif + +} + +static void __init xen_pv_smp_prepare_boot_cpu(void) +{ + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + if (!xen_feature(XENFEAT_writable_page_tables)) + /* We've switched to the "real" per-cpu gdt, so make + * sure the old memory can be recycled. */ + make_lowmem_page_readwrite(xen_initial_gdt); + +#ifdef CONFIG_X86_32 + /* + * Xen starts us with XEN_FLAT_RING1_DS, but linux code + * expects __USER_DS + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + + xen_filter_cpu_maps(); + xen_setup_vcpu_info_placement(); + + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. 
+ */ + xen_init_spinlocks(); +} + +static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned cpu; + unsigned int i; + + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? + "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); + } + xen_init_lock_cpu(0); + + smp_store_boot_cpu_info(); + cpu_data(0).x86_max_cores = 1; + + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + } + set_cpu_sibling_map(0); + + xen_pmu_init(0); + + if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0)) + BUG(); + + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) + panic("could not allocate xen_cpu_initialized_map\n"); + + cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); + + /* Restrict the possible_map according to max_cpus. */ + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) + continue; + set_cpu_possible(cpu, false); + } + + for_each_possible_cpu(cpu) + set_cpu_present(cpu, true); +} + +static int +cpu_initialize_context(unsigned int cpu, struct task_struct *idle) +{ + struct vcpu_guest_context *ctxt; + struct desc_struct *gdt; + unsigned long gdt_mfn; + + /* used to tell cpu_init() that it can proceed with initialization */ + cpumask_set_cpu(cpu, cpu_callout_mask); + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt == NULL) + return -ENOMEM; + + gdt = get_cpu_gdt_rw(cpu); + +#ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = __KERNEL_STACK_CANARY; +#endif + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; + + xen_copy_trap_info(ctxt->trap_ctxt); + + ctxt->ldt_ents = 0; + + BUG_ON((unsigned long)gdt & ~PAGE_MASK); + + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; + + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; + +#ifdef CONFIG_X86_32 + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); +#endif + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) + BUG(); + + kfree(ctxt); + return 0; +} + +static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) +{ + int rc; + + common_cpu_up(cpu, idle); + + xen_setup_runstate_info(cpu); + + /* + * PV VCPUs are always successfully taken down (see 'while' loop + * in xen_cpu_die()), so -EBUSY is an error. 
+ */ + rc = cpu_check_up_prepare(cpu); + if (rc) + return rc; + + /* make sure interrupts start blocked */ + per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; + + rc = cpu_initialize_context(cpu, idle); + if (rc) + return rc; + + xen_pmu_init(cpu); + + rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); + BUG_ON(rc); + + while (cpu_report_state(cpu) != CPU_ONLINE) + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + + return 0; +} + +static void xen_pv_smp_cpus_done(unsigned int max_cpus) +{ +} + +#ifdef CONFIG_HOTPLUG_CPU +static int xen_pv_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); + return 0; +} + +static void xen_pv_cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, + xen_vcpu_nr(cpu), NULL)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/10); + } + + if (common_cpu_die(cpu) == 0) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + xen_pmu_finish(cpu); + } +} + +static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); + cpu_bringup(); + /* + * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) + * clears certain data that the cpu_idle loop (which called us + * and that we return from) expects. The only way to get that + * data back is to call: + */ + tick_nohz_idle_enter(); + + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static int xen_pv_cpu_disable(void) +{ + return -ENOSYS; +} + +static void xen_pv_cpu_die(unsigned int cpu) +{ + BUG(); +} + +static void xen_pv_play_dead(void) +{ + BUG(); +} + +#endif +static void stop_self(void *v) +{ + int cpu = smp_processor_id(); + + /* make sure we're not pinning something down */ + load_cr3(swapper_pg_dir); + /* should set up a minimal gdt */ + + set_cpu_online(cpu, false); + + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL); + BUG(); +} + +static void xen_pv_stop_other_cpus(int wait) +{ + smp_call_function(stop_self, NULL, wait); +} + +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) +{ + irq_enter(); + irq_work_run(); + inc_irq_stat(apic_irq_work_irqs); + irq_exit(); + + return IRQ_HANDLED; +} + +static const struct smp_ops xen_smp_ops __initconst = { + .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_pv_smp_prepare_cpus, + .smp_cpus_done = xen_pv_smp_cpus_done, + + .cpu_up = xen_pv_cpu_up, + .cpu_die = xen_pv_cpu_die, + .cpu_disable = xen_pv_cpu_disable, + .play_dead = xen_pv_play_dead, + + .stop_other_cpus = xen_pv_stop_other_cpus, + .smp_send_reschedule = xen_smp_send_reschedule, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, +}; + +void __init xen_smp_init(void) +{ + smp_ops = xen_smp_ops; + xen_fill_possible_map(); +} diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 7f664c416faf..d6b1680693a9 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -14,60 +14,6 @@ #include "mmu.h" #include "pmu.h" -static void xen_pv_pre_suspend(void) -{ - xen_mm_pin_all(); - - xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = - mfn_to_pfn(xen_start_info->console.domU.mfn); - - BUG_ON(!irqs_disabled()); - - HYPERVISOR_shared_info = &xen_dummy_shared_info; - if 
(HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), - __pte_ma(0), 0)) - BUG(); -} - -static void xen_hvm_post_suspend(int suspend_cancelled) -{ -#ifdef CONFIG_XEN_PVHVM - int cpu; - if (!suspend_cancelled) - xen_hvm_init_shared_info(); - xen_callback_vector(); - xen_unplug_emulated_devices(); - if (xen_feature(XENFEAT_hvm_safe_pvclock)) { - for_each_online_cpu(cpu) { - xen_setup_runstate_info(cpu); - } - } -#endif -} - -static void xen_pv_post_suspend(int suspend_cancelled) -{ - xen_build_mfn_list_list(); - - xen_setup_shared_info(); - - if (suspend_cancelled) { - xen_start_info->store_mfn = - pfn_to_mfn(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = - pfn_to_mfn(xen_start_info->console.domU.mfn); - } else { -#ifdef CONFIG_SMP - BUG_ON(xen_cpu_initialized_map == NULL); - cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); -#endif - xen_vcpu_restore(); - } - - xen_mm_unpin_all(); -} - void xen_arch_pre_suspend(void) { if (xen_pv_domain()) diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c new file mode 100644 index 000000000000..01afcadde50a --- /dev/null +++ b/arch/x86/xen/suspend_hvm.c @@ -0,0 +1,22 @@ +#include <linux/types.h> + +#include <xen/xen.h> +#include <xen/features.h> +#include <xen/interface/features.h> + +#include "xen-ops.h" + +void xen_hvm_post_suspend(int suspend_cancelled) +{ + int cpu; + + if (!suspend_cancelled) + xen_hvm_init_shared_info(); + xen_callback_vector(); + xen_unplug_emulated_devices(); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { + for_each_online_cpu(cpu) { + xen_setup_runstate_info(cpu); + } + } +} diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c new file mode 100644 index 000000000000..3abe4f07f34a --- /dev/null +++ b/arch/x86/xen/suspend_pv.c @@ -0,0 +1,46 @@ +#include <linux/types.h> + +#include <asm/fixmap.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +#include "xen-ops.h" + +void xen_pv_pre_suspend(void) +{ + xen_mm_pin_all(); + + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); + + BUG_ON(!irqs_disabled()); + + HYPERVISOR_shared_info = &xen_dummy_shared_info; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + __pte_ma(0), 0)) + BUG(); +} + +void xen_pv_post_suspend(int suspend_cancelled) +{ + xen_build_mfn_list_list(); + + xen_setup_shared_info(); + + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + BUG_ON(xen_cpu_initialized_map == NULL); + cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); +#endif + xen_vcpu_restore(); + } + + xen_mm_unpin_all(); +} diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 1e69956d7852..090c7eb4dca9 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -209,7 +209,9 @@ static const struct clock_event_device xen_timerop_clockevent = { .features = CLOCK_EVT_FEAT_ONESHOT, .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, .min_delta_ns = TIMER_SLOP, + .min_delta_ticks = TIMER_SLOP, .mult = 1, .shift = 0, @@ -268,7 +270,9 @@ static const struct clock_event_device xen_vcpuop_clockevent = { .features = CLOCK_EVT_FEAT_ONESHOT, .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, .min_delta_ns = TIMER_SLOP, + .min_delta_ticks = TIMER_SLOP, .mult = 1, .shift = 0, @@ -432,6 +436,14 @@ static void 
xen_hvm_setup_cpu_clockevents(void) void __init xen_hvm_init_time_ops(void) { + /* + * vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 and at this point we don't know how many cpus are + * available. + */ + if (!xen_have_vector_callback) + return; + if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { printk(KERN_INFO "Xen doesn't support pvclock on HVM," "disable pv timer\n"); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 37794e42b67d..72a8e6adebe6 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -16,6 +16,7 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> +#ifdef CONFIG_XEN_PV __INIT ENTRY(startup_xen) cld @@ -34,6 +35,7 @@ ENTRY(startup_xen) jmp xen_start_kernel __FINIT +#endif .pushsection .text .balign PAGE_SIZE @@ -58,7 +60,9 @@ ENTRY(hypercall_page) /* Map the p2m table to a 512GB-aligned user address. */ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) #endif +#ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) +#endif ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f6a41c41ebc7..9a440a42c618 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -76,6 +76,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id); bool xen_vcpu_stolen(int vcpu); +extern int xen_have_vcpu_info_placement; + void xen_vcpu_setup(int cpu); void xen_setup_vcpu_info_placement(void); @@ -146,4 +148,24 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); +int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), + int (*cpu_dead_cb)(unsigned int)); + +void xen_pin_vcpu(int cpu); + +void xen_emergency_restart(void); +#ifdef CONFIG_XEN_PV +void xen_pv_pre_suspend(void); +void xen_pv_post_suspend(int suspend_cancelled); +#else +static inline void xen_pv_pre_suspend(void) {} +static inline void xen_pv_post_suspend(int suspend_cancelled) {} +#endif + +#ifdef CONFIG_XEN_PVHVM +void xen_hvm_post_suspend(int suspend_cancelled); +#else +static inline void xen_hvm_post_suspend(int suspend_cancelled) {} +#endif + #endif /* XEN_OPS_H */ |
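
The closing xen-ops.h hunk, like the pmu.h hunk earlier in this diff, relies on a common kernel idiom: declare the real hook when the relevant Kconfig option is enabled, and provide an empty static inline stub otherwise, so shared code (suspend.c, the common SMP paths) can call the hook unconditionally with no #ifdef at the call site. The sketch below only illustrates that idiom and is not part of the patch; the header guard, the Kconfig symbol and the function name are made up for the example.

/* hypothetical_feature.h - illustrative sketch only, not from this patch */
#ifndef HYPOTHETICAL_FEATURE_H
#define HYPOTHETICAL_FEATURE_H

#ifdef CONFIG_HYPOTHETICAL_FEATURE
/* Real implementation is compiled only when the option is enabled. */
void hypothetical_post_suspend(int suspend_cancelled);
#else
/* Empty stub: callers need no #ifdef and the call compiles away. */
static inline void hypothetical_post_suspend(int suspend_cancelled) { }
#endif

#endif /* HYPOTHETICAL_FEATURE_H */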