diff options
Diffstat (limited to 'arch')
277 files changed, 6232 insertions, 2223 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index f11f0698b148..c47b328eada0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -781,7 +781,7 @@ config COMPAT_OLD_SIGACTION bool config 64BIT_TIME - def_bool ARCH_HAS_64BIT_TIME + def_bool y help This should be selected by all architectures that need to support new system calls with a 64-bit time_t. This is relevant on all 32-bit diff --git a/arch/alpha/include/asm/segment.h b/arch/alpha/include/asm/segment.h deleted file mode 100644 index 0453d97daae7..000000000000 --- a/arch/alpha/include/asm/segment.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ALPHA_SEGMENT_H -#define __ALPHA_SEGMENT_H - -/* Only here because we have some old header files that expect it.. */ - -#endif diff --git a/arch/alpha/kernel/smc37c669.c b/arch/alpha/kernel/smc37c669.c index 4dbd4e415041..bbbd34586de0 100644 --- a/arch/alpha/kernel/smc37c669.c +++ b/arch/alpha/kernel/smc37c669.c @@ -10,7 +10,6 @@ #include <asm/hwrpb.h> #include <asm/io.h> -#include <asm/segment.h> #if 0 # define DBG_DEVS(args) printk args diff --git a/arch/alpha/kernel/smc37c93x.c b/arch/alpha/kernel/smc37c93x.c index 733f08966fd2..71cd7aca38ce 100644 --- a/arch/alpha/kernel/smc37c93x.c +++ b/arch/alpha/kernel/smc37c93x.c @@ -11,7 +11,6 @@ #include <asm/hwrpb.h> #include <asm/io.h> -#include <asm/segment.h> #define SMC_DEBUG 0 diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 165f268beafc..9e7704e44f6d 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -467,3 +467,9 @@ 535 common io_uring_setup sys_io_uring_setup 536 common io_uring_enter sys_io_uring_enter 537 common io_uring_register sys_io_uring_register +538 common open_tree sys_open_tree +539 common move_mount sys_move_mount +540 common fsopen sys_fsopen +541 common fsconfig sys_fsconfig +542 common fsmount sys_fsmount +543 common fspick sys_fspick diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index 
eabc3efa6c6d..526418543379 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -742,6 +742,7 @@ extern long arc_strnlen_user_noinline(const char __user *src, long n); #endif +#include <asm/segment.h> #include <asm-generic/uaccess.h> #endif diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 05ecc004de86..f863c6935d0e 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -116,8 +116,7 @@ endif AFLAGS_NOWARN :=$(call as-option,-Wa$(comma)-mno-warn-deprecated,-Wa$(comma)-W) ifeq ($(CONFIG_THUMB2_KERNEL),y) -AFLAGS_AUTOIT :=$(call as-option,-Wa$(comma)-mimplicit-it=always,-Wa$(comma)-mauto-it) -CFLAGS_ISA :=-mthumb $(AFLAGS_AUTOIT) $(AFLAGS_NOWARN) +CFLAGS_ISA :=-mthumb -Wa,-mimplicit-it=always $(AFLAGS_NOWARN) AFLAGS_ISA :=$(CFLAGS_ISA) -Wa$(comma)-mthumb # Work around buggy relocation from gas if requested: ifeq ($(CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11),y) diff --git a/arch/arm/configs/mini2440_defconfig b/arch/arm/configs/mini2440_defconfig index 8b0f7c4c3f09..7d26ca0b1302 100644 --- a/arch/arm/configs/mini2440_defconfig +++ b/arch/arm/configs/mini2440_defconfig @@ -152,7 +152,7 @@ CONFIG_SPI_S3C24XX=y CONFIG_SPI_SPIDEV=y CONFIG_GPIO_SYSFS=y CONFIG_SENSORS_LM75=y -CONFIG_THERMAL=m +CONFIG_THERMAL=y CONFIG_WATCHDOG=y CONFIG_S3C2410_WATCHDOG=y CONFIG_FB=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index f6d24d762a7f..07ebbdce3645 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -387,7 +387,7 @@ CONFIG_SENSORS_LM75=m CONFIG_SENSORS_LM90=m CONFIG_SENSORS_LM95245=m CONFIG_SENSORS_NTC_THERMISTOR=m -CONFIG_THERMAL=m +CONFIG_THERMAL=y CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=m CONFIG_SA1100_WATCHDOG=m diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 0b2ecc98e086..60de9d13181a 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -14,7 +14,6 @@ generic-y += msi.h generic-y += parport.h generic-y += preempt.h 
generic-y += seccomp.h -generic-y += segment.h generic-y += serial.h generic-y += simd.h generic-y += trace_clock.h diff --git a/arch/arm/include/asm/domain.h b/arch/arm/include/asm/domain.h index 99d9f630d6b6..1888c2d15da5 100644 --- a/arch/arm/include/asm/domain.h +++ b/arch/arm/include/asm/domain.h @@ -133,9 +133,11 @@ static inline void modify_domain(unsigned dom, unsigned type) { } * instructions (inline assembly) */ #ifdef CONFIG_CPU_USE_DOMAINS -#define TUSER(instr) #instr "t" +#define TUSER(instr) TUSERCOND(instr, ) +#define TUSERCOND(instr, cond) #instr "t" #cond #else -#define TUSER(instr) #instr +#define TUSER(instr) TUSERCOND(instr, ) +#define TUSERCOND(instr, cond) #instr #cond #endif #else /* __ASSEMBLY__ */ diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 0a46676b4245..83c391b597d4 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -110,10 +110,11 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, preempt_disable(); __ua_flags = uaccess_save_and_enable(); __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" + " .syntax unified\n" "1: " TUSER(ldr) " %1, [%4]\n" " teq %1, %2\n" " it eq @ explicit IT needed for the 2b label\n" - "2: " TUSER(streq) " %3, [%4]\n" + "2: " TUSERCOND(str, eq) " %3, [%4]\n" __futex_atomic_ex_table("%5") : "+r" (ret), "=&r" (val) : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 8927cae7c966..efb0e2c0d84c 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -343,4 +343,6 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, } } +static inline void vcpu_ptrauth_setup_lazy(struct kvm_vcpu *vcpu) {} + #endif /* __ARM_KVM_EMULATE_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 770d73257ad9..075e1921fdd9 100644 --- a/arch/arm/include/asm/kvm_host.h +++ 
b/arch/arm/include/asm/kvm_host.h @@ -19,6 +19,7 @@ #ifndef __ARM_KVM_HOST_H__ #define __ARM_KVM_HOST_H__ +#include <linux/errno.h> #include <linux/types.h> #include <linux/kvm_types.h> #include <asm/cputype.h> @@ -53,6 +54,8 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); +static inline int kvm_arm_init_sve(void) { return 0; } + u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -150,9 +153,13 @@ struct kvm_cpu_context { u32 cp15[NR_CP15_REGS]; }; -typedef struct kvm_cpu_context kvm_cpu_context_t; +struct kvm_host_data { + struct kvm_cpu_context host_ctxt; +}; + +typedef struct kvm_host_data kvm_host_data_t; -static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, +static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt, int cpu) { /* The host's MPIDR is immutable, so let's set it up at boot time */ @@ -182,7 +189,7 @@ struct kvm_vcpu_arch { struct kvm_vcpu_fault_info fault; /* Host FP context */ - kvm_cpu_context_t *host_cpu_context; + struct kvm_cpu_context *host_cpu_context; /* VGIC state */ struct vgic_cpu vgic_cpu; @@ -361,6 +368,9 @@ static inline void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) {} +static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} +static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} + static inline void kvm_arm_vhe_guest_enter(void) {} static inline void kvm_arm_vhe_guest_exit(void) {} @@ -409,4 +419,14 @@ static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) return 0; } +static inline int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) +{ + return -EINVAL; +} + +static inline bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) +{ + return true; +} + #endif /* __ARM_KVM_HOST_H__ */ 
diff --git a/arch/arm/include/asm/limits.h b/arch/arm/include/asm/limits.h deleted file mode 100644 index ab159371d786..000000000000 --- a/arch/arm/include/asm/limits.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_PIPE_H -#define __ASM_PIPE_H - -#ifndef PAGE_SIZE -#include <asm/page.h> -#endif - -#define PIPE_BUF PAGE_SIZE - -#endif - diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 57fe73ea0f72..5d06f75ffad4 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -135,8 +135,8 @@ static inline void prefetchw(const void *ptr) __asm__ __volatile__( ".arch_extension mp\n" __ALT_SMP_ASM( - WASM(pldw) "\t%a0", - WASM(pld) "\t%a0" + "pldw\t%a0", + "pld\t%a0" ) :: "p" (ptr)); } diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index dff49845eb87..d49ce8f48be3 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -112,10 +112,11 @@ static inline void __user *__uaccess_mask_range_ptr(const void __user *ptr, unsigned long tmp; asm volatile( + " .syntax unified\n" " sub %1, %3, #1\n" " subs %1, %1, %0\n" " addhs %1, %1, #1\n" - " subhss %1, %1, %2\n" + " subshs %1, %1, %2\n" " movlo %0, #0\n" : "+r" (safe_ptr), "=&r" (tmp) : "r" (size), "r" (current_thread_info()->addr_limit) diff --git a/arch/arm/mach-davinci/da830.c b/arch/arm/mach-davinci/da830.c index 63511f638ce4..e6b8ffd934a1 100644 --- a/arch/arm/mach-davinci/da830.c +++ b/arch/arm/mach-davinci/da830.c @@ -12,6 +12,7 @@ #include <linux/clk/davinci.h> #include <linux/gpio.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/irqchip/irq-davinci-cp-intc.h> #include <linux/platform_data/gpio-davinci.h> diff --git a/arch/arm/mach-davinci/da850.c b/arch/arm/mach-davinci/da850.c index 67ab71ba3ad3..77bc64d6e39b 100644 --- a/arch/arm/mach-davinci/da850.c +++ b/arch/arm/mach-davinci/da850.c @@ -18,6 +18,7 @@ #include <linux/cpufreq.h> #include 
<linux/gpio.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/irqchip/irq-davinci-cp-intc.h> #include <linux/mfd/da8xx-cfgchip.h> #include <linux/platform_data/clk-da8xx-cfgchip.h> diff --git a/arch/arm/mach-davinci/devices-da8xx.c b/arch/arm/mach-davinci/devices-da8xx.c index b8dc674e06bc..036139fe0d0f 100644 --- a/arch/arm/mach-davinci/devices-da8xx.c +++ b/arch/arm/mach-davinci/devices-da8xx.c @@ -17,6 +17,7 @@ #include <linux/dma-contiguous.h> #include <linux/dmaengine.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/platform_device.h> #include <linux/reboot.h> #include <linux/serial_8250.h> diff --git a/arch/arm/mach-davinci/dm355.c b/arch/arm/mach-davinci/dm355.c index 4a482445b9a2..c6073326be2e 100644 --- a/arch/arm/mach-davinci/dm355.c +++ b/arch/arm/mach-davinci/dm355.c @@ -15,6 +15,7 @@ #include <linux/dma-mapping.h> #include <linux/dmaengine.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/irqchip/irq-davinci-aintc.h> #include <linux/platform_data/edma.h> #include <linux/platform_data/gpio-davinci.h> diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c index 8e0a77315add..2f9ae6431bf5 100644 --- a/arch/arm/mach-davinci/dm365.c +++ b/arch/arm/mach-davinci/dm365.c @@ -19,6 +19,7 @@ #include <linux/dma-mapping.h> #include <linux/dmaengine.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/irqchip/irq-davinci-aintc.h> #include <linux/platform_data/edma.h> #include <linux/platform_data/gpio-davinci.h> diff --git a/arch/arm/mach-davinci/dm644x.c b/arch/arm/mach-davinci/dm644x.c index cecc7ceb8d34..1b9e9a6192ef 100644 --- a/arch/arm/mach-davinci/dm644x.c +++ b/arch/arm/mach-davinci/dm644x.c @@ -14,6 +14,7 @@ #include <linux/clkdev.h> #include <linux/dmaengine.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/irqchip/irq-davinci-aintc.h> #include <linux/platform_data/edma.h> #include <linux/platform_data/gpio-davinci.h> diff --git 
a/arch/arm/mach-davinci/dm646x.c b/arch/arm/mach-davinci/dm646x.c index f33392f77a03..62ca952fe161 100644 --- a/arch/arm/mach-davinci/dm646x.c +++ b/arch/arm/mach-davinci/dm646x.c @@ -15,6 +15,7 @@ #include <linux/dma-mapping.h> #include <linux/dmaengine.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/irqchip/irq-davinci-aintc.h> #include <linux/platform_data/edma.h> #include <linux/platform_data/gpio-davinci.h> diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c index 0d420a2bfe3e..d7b826d2695c 100644 --- a/arch/arm/mach-dove/common.c +++ b/arch/arm/mach-dove/common.c @@ -11,6 +11,7 @@ #include <linux/clk-provider.h> #include <linux/dma-mapping.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/of.h> #include <linux/of_platform.h> #include <linux/platform_data/dma-mv_xor.h> diff --git a/arch/arm/mach-mediatek/mediatek.c b/arch/arm/mach-mediatek/mediatek.c index b6a81ba1ce32..5a9c016b3c6c 100644 --- a/arch/arm/mach-mediatek/mediatek.c +++ b/arch/arm/mach-mediatek/mediatek.c @@ -15,6 +15,7 @@ * GNU General Public License for more details. 
*/ #include <linux/init.h> +#include <linux/io.h> #include <asm/mach/arch.h> #include <linux/of.h> #include <linux/clk-provider.h> diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index f72e1e9f5fc5..dd762d1b083f 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/platform_device.h> #include <linux/serial_8250.h> #include <linux/ata_platform.h> diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index c67f92bfa30e..7bcb41137bbf 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/platform_device.h> #include <linux/dma-mapping.h> #include <linux/serial_8250.h> diff --git a/arch/arm/mach-rockchip/rockchip.c b/arch/arm/mach-rockchip/rockchip.c index e41cabc4dc2b..06ab03b93109 100644 --- a/arch/arm/mach-rockchip/rockchip.c +++ b/arch/arm/mach-rockchip/rockchip.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/of_platform.h> #include <linux/irqchip.h> #include <linux/clk-provider.h> diff --git a/arch/arm/mach-zynq/common.c b/arch/arm/mach-zynq/common.c index 6aba9ebf8041..7f634eaeaf10 100644 --- a/arch/arm/mach-zynq/common.c +++ b/arch/arm/mach-zynq/common.c @@ -15,6 +15,7 @@ */ #include <linux/init.h> +#include <linux/io.h> #include <linux/kernel.h> #include <linux/cpumask.h> #include <linux/platform_device.h> diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 68dcd5f8d7c6..be0b42937888 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -182,21 +182,6 @@ int pfn_valid(unsigned long pfn) EXPORT_SYMBOL(pfn_valid); #endif -#ifndef CONFIG_SPARSEMEM -static void __init arm_memory_present(void) -{ -} -#else -static void __init arm_memory_present(void) -{ - struct memblock_region 
*reg; - - for_each_memblock(memory, reg) - memory_present(0, memblock_region_memory_base_pfn(reg), - memblock_region_memory_end_pfn(reg)); -} -#endif - static bool arm_memblock_steal_permitted = true; phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align) @@ -293,7 +278,7 @@ void __init bootmem_init(void) * Sparsemem tries to allocate bootmem in memory_present(), * so must be done after the fixed reservations */ - arm_memory_present(); + memblocks_present(); /* * sparse_init() needs the bootmem allocator up and running. diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 0393917eaa57..aaf479a9e92d 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -441,3 +441,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index f4efff9d3afb..fadf554d9391 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -10,12 +10,12 @@ obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) ccflags-y := -fPIC -fno-common -fno-builtin -fno-stack-protector ccflags-y += -DDISABLE_BRANCH_PROFILING -VDSO_LDFLAGS := -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1 -VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 -VDSO_LDFLAGS += -nostdlib -shared -VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) -VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--build-id) -VDSO_LDFLAGS += $(call cc-ldoption, -fuse-ld=bfd) +ldflags-y = -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \ + -z max-page-size=4096 -z common-page-size=4096 \ + -nostdlib -shared \ + $(call ld-option, --hash-style=sysv) \ + $(call ld-option, --build-id) \ + -T 
obj-$(CONFIG_VDSO) += vdso.o extra-$(CONFIG_VDSO) += vdso.lds @@ -37,8 +37,8 @@ KCOV_INSTRUMENT := n $(obj)/vdso.o : $(obj)/vdso.so # Link rule for the .so file -$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE - $(call if_changed,vdsold) +$(obj)/vdso.so.raw: $(obj)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,ld) $(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/vdsomunge FORCE $(call if_changed,vdsomunge) @@ -48,11 +48,6 @@ $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) -# Actual build commands -quiet_cmd_vdsold = VDSO $@ - cmd_vdsold = $(CC) $(c_flags) $(VDSO_LDFLAGS) \ - -Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@ - quiet_cmd_vdsomunge = MUNGE $@ cmd_vdsomunge = $(objtree)/$(obj)/vdsomunge $< $@ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 69a59a5d1143..4780eb7af842 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1341,6 +1341,7 @@ menu "ARMv8.3 architectural features" config ARM64_PTR_AUTH bool "Enable support for pointer authentication" default y + depends on !KVM || ARM64_VHE help Pointer authentication (part of the ARMv8.3 Extensions) provides instructions for signing and authenticating pointers against secret @@ -1354,8 +1355,9 @@ config ARM64_PTR_AUTH context-switched along with the process. The feature is detected at runtime. If the feature is not present in - hardware it will not be advertised to userspace nor will it be - enabled. + hardware it will not be advertised to userspace/KVM guest nor will it + be enabled. However, KVM guest also require VHE mode and hence + CONFIG_ARM64_VHE=y option to use this feature. 
endmenu diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 9e977dedf193..1de6e05ce48b 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -17,7 +17,6 @@ generic-y += mmiowb.h generic-y += msi.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += segment.h generic-y += serial.h generic-y += set_memory.h generic-y += switch_to.h diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index dd1ad3950ef5..df62bbd33a9a 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -24,10 +24,13 @@ #ifndef __ASSEMBLY__ +#include <linux/bitmap.h> #include <linux/build_bug.h> +#include <linux/bug.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/stddef.h> +#include <linux/types.h> #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ @@ -56,7 +59,8 @@ extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); extern void fpsimd_bind_task_to_cpu(void); -extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state); +extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, + void *sve_state, unsigned int sve_vl); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_flush_cpu_state(void); @@ -87,6 +91,29 @@ extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused); extern u64 read_zcr_features(void); extern int __ro_after_init sve_max_vl; +extern int __ro_after_init sve_max_virtualisable_vl; +extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); + +/* + * Helpers to translate bit indices in sve_vq_map to VQ values (and + * vice versa). This allows find_next_bit() to be used to find the + * _maximum_ VQ not exceeding a certain value. 
+ */ +static inline unsigned int __vq_to_bit(unsigned int vq) +{ + return SVE_VQ_MAX - vq; +} + +static inline unsigned int __bit_to_vq(unsigned int bit) +{ + return SVE_VQ_MAX - bit; +} + +/* Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function */ +static inline bool sve_vq_available(unsigned int vq) +{ + return test_bit(__vq_to_bit(vq), sve_vq_map); +} #ifdef CONFIG_ARM64_SVE diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index f5b79e995f40..ff73f5462aca 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -108,7 +108,8 @@ extern u32 __kvm_get_mdcr_el2(void); .endm .macro get_host_ctxt reg, tmp - hyp_adr_this_cpu \reg, kvm_host_cpu_state, \tmp + hyp_adr_this_cpu \reg, kvm_host_data, \tmp + add \reg, \reg, #HOST_DATA_CONTEXT .endm .macro get_vcpu_ptr vcpu, ctxt diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index d3842791e1c4..613427fafff9 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -98,6 +98,22 @@ static inline void vcpu_set_wfe_traps(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_TWE; } +static inline void vcpu_ptrauth_enable(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); +} + +static inline void vcpu_ptrauth_disable(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK); +} + +static inline void vcpu_ptrauth_setup_lazy(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_ptrauth(vcpu)) + vcpu_ptrauth_disable(vcpu); +} + static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu) { return vcpu->arch.vsesr_el2; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a01fe087e022..2a8d3f8ca22c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -22,9 +22,13 @@ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ +#include <linux/bitmap.h> #include <linux/types.h> +#include 
<linux/jump_label.h> #include <linux/kvm_types.h> +#include <linux/percpu.h> #include <asm/arch_gicv3.h> +#include <asm/barrier.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/fpsimd.h> @@ -45,7 +49,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS -#define KVM_VCPU_MAX_FEATURES 4 +#define KVM_VCPU_MAX_FEATURES 7 #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -54,8 +58,12 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); +extern unsigned int kvm_sve_max_vl; +int kvm_arm_init_sve(void); + int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); @@ -117,6 +125,7 @@ enum vcpu_sysreg { SCTLR_EL1, /* System Control Register */ ACTLR_EL1, /* Auxiliary Control Register */ CPACR_EL1, /* Coprocessor Access Control */ + ZCR_EL1, /* SVE Control */ TTBR0_EL1, /* Translation Table Base Register 0 */ TTBR1_EL1, /* Translation Table Base Register 1 */ TCR_EL1, /* Translation Control Register */ @@ -152,6 +161,18 @@ enum vcpu_sysreg { PMSWINC_EL0, /* Software Increment Register */ PMUSERENR_EL0, /* User Enable Register */ + /* Pointer Authentication Registers in a strict increasing order. */ + APIAKEYLO_EL1, + APIAKEYHI_EL1, + APIBKEYLO_EL1, + APIBKEYHI_EL1, + APDAKEYLO_EL1, + APDAKEYHI_EL1, + APDBKEYLO_EL1, + APDBKEYHI_EL1, + APGAKEYLO_EL1, + APGAKEYHI_EL1, + /* 32bit specific registers. 
Keep them at the end of the range */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ @@ -212,7 +233,17 @@ struct kvm_cpu_context { struct kvm_vcpu *__hyp_running_vcpu; }; -typedef struct kvm_cpu_context kvm_cpu_context_t; +struct kvm_pmu_events { + u32 events_host; + u32 events_guest; +}; + +struct kvm_host_data { + struct kvm_cpu_context host_ctxt; + struct kvm_pmu_events pmu_events; +}; + +typedef struct kvm_host_data kvm_host_data_t; struct vcpu_reset_state { unsigned long pc; @@ -223,6 +254,8 @@ struct vcpu_reset_state { struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; + void *sve_state; + unsigned int sve_max_vl; /* HYP configuration */ u64 hcr_el2; @@ -255,7 +288,7 @@ struct kvm_vcpu_arch { struct kvm_guest_debug_arch external_debug_state; /* Pointer to host CPU context */ - kvm_cpu_context_t *host_cpu_context; + struct kvm_cpu_context *host_cpu_context; struct thread_info *host_thread_info; /* hyp VA */ struct user_fpsimd_state *host_fpsimd_state; /* hyp VA */ @@ -318,12 +351,40 @@ struct kvm_vcpu_arch { bool sysregs_loaded_on_cpu; }; +/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ +#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \ + sve_ffr_offset((vcpu)->arch.sve_max_vl))) + +#define vcpu_sve_state_size(vcpu) ({ \ + size_t __size_ret; \ + unsigned int __vcpu_vq; \ + \ + if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \ + __size_ret = 0; \ + } else { \ + __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl); \ + __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \ + } \ + \ + __size_ret; \ +}) + /* vcpu_arch flags field values: */ #define KVM_ARM64_DEBUG_DIRTY (1 << 0) #define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ #define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ #define KVM_ARM64_HOST_SVE_IN_USE (1 << 3) /* backup for host TIF_SVE */ #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ +#define 
KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ +#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ +#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ + +#define vcpu_has_sve(vcpu) (system_supports_sve() && \ + ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) + +#define vcpu_has_ptrauth(vcpu) ((system_supports_address_auth() || \ + system_supports_generic_auth()) && \ + ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH)) #define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs) @@ -432,9 +493,9 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); +DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data); -static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, +static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt, int cpu) { /* The host's MPIDR is immutable, so let's set it up at boot time */ @@ -452,8 +513,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. */ - u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_cpu_state) - - (u64)kvm_ksym_ref(kvm_host_cpu_state)); + u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_data) - + (u64)kvm_ksym_ref(kvm_host_data)); /* * Call initialization code, and switch to the full blown HYP code. 
@@ -491,9 +552,10 @@ static inline bool kvm_arch_requires_vhe(void) return false; } +void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); + static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} @@ -516,11 +578,28 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu); +static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) +{ + return (!has_vhe() && attr->exclude_host); +} + #ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) { return kvm_arch_vcpu_run_map_fp(vcpu); } + +void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); +void kvm_clr_pmu_events(u32 clr); + +void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt); +bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt); + +void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); +void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); +#else +static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} +static inline void kvm_clr_pmu_events(u32 clr) {} #endif static inline void kvm_arm_vhe_guest_enter(void) @@ -594,4 +673,10 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); +int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); +bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); + +#define kvm_arm_vcpu_sve_finalized(vcpu) \ + ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index c3060833b7a5..09fe8bd15f6e 
100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -149,7 +149,6 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); -bool __fpsimd_enabled(void); void activate_traps_vhe_load(struct kvm_vcpu *vcpu); void deactivate_traps_vhe_put(void); diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h new file mode 100644 index 000000000000..6301813dcace --- /dev/null +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* arch/arm64/include/asm/kvm_ptrauth.h: Guest/host ptrauth save/restore + * Copyright 2019 Arm Limited + * Authors: Mark Rutland <mark.rutland@arm.com> + * Amit Daniel Kachhap <amit.kachhap@arm.com> + */ + +#ifndef __ASM_KVM_PTRAUTH_H +#define __ASM_KVM_PTRAUTH_H + +#ifdef __ASSEMBLY__ + +#include <asm/sysreg.h> + +#ifdef CONFIG_ARM64_PTR_AUTH + +#define PTRAUTH_REG_OFFSET(x) (x - CPU_APIAKEYLO_EL1) + +/* + * CPU_AP*_EL1 values exceed immediate offset range (512) for stp + * instruction so below macros takes CPU_APIAKEYLO_EL1 as base and + * calculates the offset of the keys from this base to avoid an extra add + * instruction. These macros assumes the keys offsets follow the order of + * the sysreg enum in kvm_host.h. 
+ */ +.macro ptrauth_save_state base, reg1, reg2 + mrs_s \reg1, SYS_APIAKEYLO_EL1 + mrs_s \reg2, SYS_APIAKEYHI_EL1 + stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIAKEYLO_EL1)] + mrs_s \reg1, SYS_APIBKEYLO_EL1 + mrs_s \reg2, SYS_APIBKEYHI_EL1 + stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIBKEYLO_EL1)] + mrs_s \reg1, SYS_APDAKEYLO_EL1 + mrs_s \reg2, SYS_APDAKEYHI_EL1 + stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDAKEYLO_EL1)] + mrs_s \reg1, SYS_APDBKEYLO_EL1 + mrs_s \reg2, SYS_APDBKEYHI_EL1 + stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDBKEYLO_EL1)] + mrs_s \reg1, SYS_APGAKEYLO_EL1 + mrs_s \reg2, SYS_APGAKEYHI_EL1 + stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APGAKEYLO_EL1)] +.endm + +.macro ptrauth_restore_state base, reg1, reg2 + ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIAKEYLO_EL1)] + msr_s SYS_APIAKEYLO_EL1, \reg1 + msr_s SYS_APIAKEYHI_EL1, \reg2 + ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIBKEYLO_EL1)] + msr_s SYS_APIBKEYLO_EL1, \reg1 + msr_s SYS_APIBKEYHI_EL1, \reg2 + ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDAKEYLO_EL1)] + msr_s SYS_APDAKEYLO_EL1, \reg1 + msr_s SYS_APDAKEYHI_EL1, \reg2 + ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDBKEYLO_EL1)] + msr_s SYS_APDBKEYLO_EL1, \reg1 + msr_s SYS_APDBKEYHI_EL1, \reg2 + ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APGAKEYLO_EL1)] + msr_s SYS_APGAKEYLO_EL1, \reg1 + msr_s SYS_APGAKEYHI_EL1, \reg2 +.endm + +/* + * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will + * check for the presence of one of the cpufeature flag + * ARM64_HAS_ADDRESS_AUTH_ARCH or ARM64_HAS_ADDRESS_AUTH_IMP_DEF and + * then proceed ahead with the save/restore of Pointer Authentication + * key registers. 
+ */ +.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3 +alternative_if ARM64_HAS_ADDRESS_AUTH_ARCH + b 1000f +alternative_else_nop_endif +alternative_if_not ARM64_HAS_ADDRESS_AUTH_IMP_DEF + b 1001f +alternative_else_nop_endif +1000: + ldr \reg1, [\g_ctxt, #(VCPU_HCR_EL2 - VCPU_CONTEXT)] + and \reg1, \reg1, #(HCR_API | HCR_APK) + cbz \reg1, 1001f + add \reg1, \g_ctxt, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state \reg1, \reg2, \reg3 +1001: +.endm + +.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +alternative_if ARM64_HAS_ADDRESS_AUTH_ARCH + b 2000f +alternative_else_nop_endif +alternative_if_not ARM64_HAS_ADDRESS_AUTH_IMP_DEF + b 2001f +alternative_else_nop_endif +2000: + ldr \reg1, [\g_ctxt, #(VCPU_HCR_EL2 - VCPU_CONTEXT)] + and \reg1, \reg1, #(HCR_API | HCR_APK) + cbz \reg1, 2001f + add \reg1, \g_ctxt, #CPU_APIAKEYLO_EL1 + ptrauth_save_state \reg1, \reg2, \reg3 + add \reg1, \h_ctxt, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state \reg1, \reg2, \reg3 + isb +2001: +.endm + +#else /* !CONFIG_ARM64_PTR_AUTH */ +.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3 +.endm +.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +.endm +#endif /* CONFIG_ARM64_PTR_AUTH */ +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_KVM_PTRAUTH_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 3f7b917e8f3a..902d75b60914 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -454,6 +454,9 @@ #define SYS_ICH_LR14_EL2 __SYS__LR8_EL2(6) #define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7) +/* VHE encodings for architectural EL0/1 system registers */ +#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0) + /* Common SCTLR_ELx flags. 
*/ #define SCTLR_ELx_DSSBS (_BITUL(44)) #define SCTLR_ELx_ENIA (_BITUL(31)) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index f2a83ff6b73c..70e6882853c0 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 428 +#define __NR_compat_syscalls 434 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 23f1a44acada..c39e90600bb3 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -874,6 +874,18 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #define __NR_io_uring_register 427 __SYSCALL(__NR_io_uring_register, sys_io_uring_register) +#define __NR_open_tree 428 +__SYSCALL(__NR_open_tree, sys_open_tree) +#define __NR_move_mount 429 +__SYSCALL(__NR_move_mount, sys_move_mount) +#define __NR_fsopen 430 +__SYSCALL(__NR_fsopen, sys_fsopen) +#define __NR_fsconfig 431 +__SYSCALL(__NR_fsconfig, sys_fsconfig) +#define __NR_fsmount 432 +__SYSCALL(__NR_fsmount, sys_fsmount) +#define __NR_fspick 433 +__SYSCALL(__NR_fspick, sys_fspick) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 97c3478ee6e7..7b7ac0f6cec9 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -35,6 +35,7 @@ #include <linux/psci.h> #include <linux/types.h> #include <asm/ptrace.h> +#include <asm/sve_context.h> #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE @@ -102,6 +103,9 @@ struct kvm_regs { #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ #define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */ #define KVM_ARM_VCPU_PMU_V3 3 /* Support guest PMUv3 */ +#define 
KVM_ARM_VCPU_SVE 4 /* enable SVE for this CPU */ +#define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ +#define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ struct kvm_vcpu_init { __u32 target; @@ -226,6 +230,45 @@ struct kvm_vcpu_events { KVM_REG_ARM_FW | ((r) & 0xffff)) #define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) +/* SVE registers */ +#define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT) + +/* Z- and P-regs occupy blocks at the following offsets within this range: */ +#define KVM_REG_ARM64_SVE_ZREG_BASE 0 +#define KVM_REG_ARM64_SVE_PREG_BASE 0x400 +#define KVM_REG_ARM64_SVE_FFR_BASE 0x600 + +#define KVM_ARM64_SVE_NUM_ZREGS __SVE_NUM_ZREGS +#define KVM_ARM64_SVE_NUM_PREGS __SVE_NUM_PREGS + +#define KVM_ARM64_SVE_MAX_SLICES 32 + +#define KVM_REG_ARM64_SVE_ZREG(n, i) \ + (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_ZREG_BASE | \ + KVM_REG_SIZE_U2048 | \ + (((n) & (KVM_ARM64_SVE_NUM_ZREGS - 1)) << 5) | \ + ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1))) + +#define KVM_REG_ARM64_SVE_PREG(n, i) \ + (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_PREG_BASE | \ + KVM_REG_SIZE_U256 | \ + (((n) & (KVM_ARM64_SVE_NUM_PREGS - 1)) << 5) | \ + ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1))) + +#define KVM_REG_ARM64_SVE_FFR(i) \ + (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_FFR_BASE | \ + KVM_REG_SIZE_U256 | \ + ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1))) + +#define KVM_ARM64_SVE_VQ_MIN __SVE_VQ_MIN +#define KVM_ARM64_SVE_VQ_MAX __SVE_VQ_MAX + +/* Vector lengths pseudo-register: */ +#define KVM_REG_ARM64_SVE_VLS (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \ + KVM_REG_SIZE_U512 | 0xffff) +#define KVM_ARM64_SVE_VLS_WORDS \ + ((KVM_ARM64_SVE_VQ_MAX - KVM_ARM64_SVE_VQ_MIN) / 64 + 1) + /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e10e2a5d9ddc..947e39896e28 100644 --- 
a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -125,9 +125,16 @@ int main(void) DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags)); + DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); + DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1])); + DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1])); + DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1])); + DEFINE(CPU_APDBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDBKEYLO_EL1])); + DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1])); DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs)); DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); + DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 2b807f129e60..ca27e08e3d8a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1913,7 +1913,7 @@ static void verify_sve_features(void) unsigned int len = zcr & ZCR_ELx_LEN_MASK; if (len < safe_len || sve_verify_vq_map()) { - pr_crit("CPU%d: SVE: required vector length(s) missing\n", + pr_crit("CPU%d: SVE: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 735cf1f8b109..a38bf74bcca8 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -18,6 +18,7 @@ */ #include <linux/bitmap.h> +#include <linux/bitops.h> #include <linux/bottom_half.h> #include <linux/bug.h> #include 
<linux/cache.h> @@ -48,6 +49,7 @@ #include <asm/sigcontext.h> #include <asm/sysreg.h> #include <asm/traps.h> +#include <asm/virt.h> #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -119,6 +121,8 @@ */ struct fpsimd_last_state_struct { struct user_fpsimd_state *st; + void *sve_state; + unsigned int sve_vl; }; static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); @@ -130,14 +134,23 @@ static int sve_default_vl = -1; /* Maximum supported vector length across all CPUs (initially poisoned) */ int __ro_after_init sve_max_vl = SVE_VL_MIN; -/* Set of available vector lengths, as vq_to_bit(vq): */ -static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); +int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN; + +/* + * Set of available vector lengths, + * where length vq encoded as bit __vq_to_bit(vq): + */ +__ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); +/* Set of vector lengths present on at least one cpu: */ +static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); + static void __percpu *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); +extern __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); extern void __percpu *efi_sve_state; #endif /* ! 
CONFIG_ARM64_SVE */ @@ -235,14 +248,15 @@ static void task_fpsimd_load(void) */ void fpsimd_save(void) { - struct user_fpsimd_state *st = __this_cpu_read(fpsimd_last_state.st); + struct fpsimd_last_state_struct const *last = + this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ WARN_ON(!in_softirq() && !irqs_disabled()); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { if (system_supports_sve() && test_thread_flag(TIF_SVE)) { - if (WARN_ON(sve_get_vl() != current->thread.sve_vl)) { + if (WARN_ON(sve_get_vl() != last->sve_vl)) { /* * Can't save the user regs, so current would * re-enter user with corrupt state. @@ -252,32 +266,15 @@ void fpsimd_save(void) return; } - sve_save_state(sve_pffr(&current->thread), &st->fpsr); + sve_save_state((char *)last->sve_state + + sve_ffr_offset(last->sve_vl), + &last->st->fpsr); } else - fpsimd_save_state(st); + fpsimd_save_state(last->st); } } /* - * Helpers to translate bit indices in sve_vq_map to VQ values (and - * vice versa). This allows find_next_bit() to be used to find the - * _maximum_ VQ not exceeding a certain value. - */ - -static unsigned int vq_to_bit(unsigned int vq) -{ - return SVE_VQ_MAX - vq; -} - -static unsigned int bit_to_vq(unsigned int bit) -{ - if (WARN_ON(bit >= SVE_VQ_MAX)) - bit = SVE_VQ_MAX - 1; - - return SVE_VQ_MAX - bit; -} - -/* * All vector length selection from userspace comes through here. * We're on a slow path, so some sanity-checks are included.
* If things go wrong there's a bug somewhere, but try to fall back to a @@ -298,8 +295,8 @@ static unsigned int find_supported_vector_length(unsigned int vl) vl = max_vl; bit = find_next_bit(sve_vq_map, SVE_VQ_MAX, - vq_to_bit(sve_vq_from_vl(vl))); - return sve_vl_from_vq(bit_to_vq(bit)); + __vq_to_bit(sve_vq_from_vl(vl))); + return sve_vl_from_vq(__bit_to_vq(bit)); } #ifdef CONFIG_SYSCTL @@ -550,7 +547,6 @@ int sve_set_vector_length(struct task_struct *task, local_bh_disable(); fpsimd_save(); - set_thread_flag(TIF_FOREIGN_FPSTATE); } fpsimd_flush_task_state(task); @@ -624,12 +620,6 @@ int sve_get_current_vl(void) return sve_prctl_status(0); } -/* - * Bitmap for temporary storage of the per-CPU set of supported vector lengths - * during secondary boot. - */ -static DECLARE_BITMAP(sve_secondary_vq_map, SVE_VQ_MAX); - static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) { unsigned int vq, vl; @@ -644,40 +634,82 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */ vl = sve_get_vl(); vq = sve_vq_from_vl(vl); /* skip intervening lengths */ - set_bit(vq_to_bit(vq), map); + set_bit(__vq_to_bit(vq), map); } } +/* + * Initialise the set of known supported VQs for the boot CPU. + * This is called during kernel boot, before secondary CPUs are brought up. + */ void __init sve_init_vq_map(void) { sve_probe_vqs(sve_vq_map); + bitmap_copy(sve_vq_partial_map, sve_vq_map, SVE_VQ_MAX); } /* * If we haven't committed to the set of supported VQs yet, filter out * those not supported by the current CPU. + * This function is called during the bring-up of early secondary CPUs only. 
*/ void sve_update_vq_map(void) { - sve_probe_vqs(sve_secondary_vq_map); - bitmap_and(sve_vq_map, sve_vq_map, sve_secondary_vq_map, SVE_VQ_MAX); + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + + sve_probe_vqs(tmp_map); + bitmap_and(sve_vq_map, sve_vq_map, tmp_map, SVE_VQ_MAX); + bitmap_or(sve_vq_partial_map, sve_vq_partial_map, tmp_map, SVE_VQ_MAX); } -/* Check whether the current CPU supports all VQs in the committed set */ +/* + * Check whether the current CPU supports all VQs in the committed set. + * This function is called during the bring-up of late secondary CPUs only. + */ int sve_verify_vq_map(void) { - int ret = 0; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + unsigned long b; - sve_probe_vqs(sve_secondary_vq_map); - bitmap_andnot(sve_secondary_vq_map, sve_vq_map, sve_secondary_vq_map, - SVE_VQ_MAX); - if (!bitmap_empty(sve_secondary_vq_map, SVE_VQ_MAX)) { + sve_probe_vqs(tmp_map); + + bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); + if (bitmap_intersects(tmp_map, sve_vq_map, SVE_VQ_MAX)) { pr_warn("SVE: cpu%d: Required vector length(s) missing\n", smp_processor_id()); - ret = -EINVAL; + return -EINVAL; } - return ret; + if (!IS_ENABLED(CONFIG_KVM) || !is_hyp_mode_available()) + return 0; + + /* + * For KVM, it is necessary to ensure that this CPU doesn't + * support any vector length that guests may have probed as + * unsupported. 
+ */ + + /* Recover the set of supported VQs: */ + bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); + /* Find VQs supported that are not globally supported: */ + bitmap_andnot(tmp_map, tmp_map, sve_vq_map, SVE_VQ_MAX); + + /* Find the lowest such VQ, if any: */ + b = find_last_bit(tmp_map, SVE_VQ_MAX); + if (b >= SVE_VQ_MAX) + return 0; /* no mismatches */ + + /* + * Mismatches above sve_max_virtualisable_vl are fine, since + * no guest is allowed to configure ZCR_EL2.LEN to exceed this: + */ + if (sve_vl_from_vq(__bit_to_vq(b)) <= sve_max_virtualisable_vl) { + pr_warn("SVE: cpu%d: Unsupported vector length(s) present\n", + smp_processor_id()); + return -EINVAL; + } + + return 0; } static void __init sve_efi_setup(void) @@ -744,6 +776,8 @@ u64 read_zcr_features(void) void __init sve_setup(void) { u64 zcr; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + unsigned long b; if (!system_supports_sve()) return; @@ -753,8 +787,8 @@ void __init sve_setup(void) * so sve_vq_map must have at least SVE_VQ_MIN set. * If something went wrong, at least try to patch it up: */ - if (WARN_ON(!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map))) - set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map); + if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map))) + set_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map); zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); sve_max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); @@ -772,11 +806,31 @@ void __init sve_setup(void) */ sve_default_vl = find_supported_vector_length(64); + bitmap_andnot(tmp_map, sve_vq_partial_map, sve_vq_map, + SVE_VQ_MAX); + + b = find_last_bit(tmp_map, SVE_VQ_MAX); + if (b >= SVE_VQ_MAX) + /* No non-virtualisable VLs found */ + sve_max_virtualisable_vl = SVE_VQ_MAX; + else if (WARN_ON(b == SVE_VQ_MAX - 1)) + /* No virtualisable VLs? This is architecturally forbidden. 
*/ + sve_max_virtualisable_vl = SVE_VQ_MIN; + else /* b + 1 < SVE_VQ_MAX */ + sve_max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1)); + + if (sve_max_virtualisable_vl > sve_max_vl) + sve_max_virtualisable_vl = sve_max_vl; + pr_info("SVE: maximum available vector length %u bytes per vector\n", sve_max_vl); pr_info("SVE: default vector length %u bytes per vector\n", sve_default_vl); + /* KVM decides whether to support mismatched systems. Just warn here: */ + if (sve_max_virtualisable_vl < sve_max_vl) + pr_warn("SVE: unvirtualisable vector lengths present\n"); + sve_efi_setup(); } @@ -816,12 +870,11 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs) local_bh_disable(); fpsimd_save(); - fpsimd_to_sve(current); /* Force ret_to_user to reload the registers: */ fpsimd_flush_task_state(current); - set_thread_flag(TIF_FOREIGN_FPSTATE); + fpsimd_to_sve(current); if (test_and_set_thread_flag(TIF_SVE)) WARN_ON(1); /* SVE access shouldn't have trapped */ @@ -894,9 +947,9 @@ void fpsimd_flush_thread(void) local_bh_disable(); + fpsimd_flush_task_state(current); memset(&current->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); - fpsimd_flush_task_state(current); if (system_supports_sve()) { clear_thread_flag(TIF_SVE); @@ -933,8 +986,6 @@ void fpsimd_flush_thread(void) current->thread.sve_vl_onexec = 0; } - set_thread_flag(TIF_FOREIGN_FPSTATE); - local_bh_enable(); } @@ -974,6 +1025,8 @@ void fpsimd_bind_task_to_cpu(void) this_cpu_ptr(&fpsimd_last_state); last->st = &current->thread.uw.fpsimd_state; + last->sve_state = current->thread.sve_state; + last->sve_vl = current->thread.sve_vl; current->thread.fpsimd_cpu = smp_processor_id(); if (system_supports_sve()) { @@ -987,7 +1040,8 @@ void fpsimd_bind_task_to_cpu(void) } } -void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st) +void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, + unsigned int sve_vl) { struct fpsimd_last_state_struct *last =
this_cpu_ptr(&fpsimd_last_state); @@ -995,6 +1049,8 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st) WARN_ON(!in_softirq() && !irqs_disabled()); last->st = st; + last->sve_state = sve_state; + last->sve_vl = sve_vl; } /* @@ -1043,12 +1099,29 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) /* * Invalidate live CPU copies of task t's FPSIMD state + * + * This function may be called with preemption enabled. The barrier() + * ensures that the assignment to fpsimd_cpu is visible to any + * preemption/softirq that could race with set_tsk_thread_flag(), so + * that TIF_FOREIGN_FPSTATE cannot be spuriously re-cleared. + * + * The final barrier ensures that TIF_FOREIGN_FPSTATE is seen set by any + * subsequent code. */ void fpsimd_flush_task_state(struct task_struct *t) { t->thread.fpsimd_cpu = NR_CPUS; + + barrier(); + set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE); + + barrier(); } +/* + * Invalidate any task's FPSIMD state that is present on this cpu. + * This function must be called with softirqs disabled. 
+ */ void fpsimd_flush_cpu_state(void) { __this_cpu_write(fpsimd_last_state.st, NULL); diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 6164d389eed6..348d12eec566 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -26,6 +26,7 @@ #include <linux/acpi.h> #include <linux/clocksource.h> +#include <linux/kvm_host.h> #include <linux/of.h> #include <linux/perf/arm_pmu.h> #include <linux/platform_device.h> @@ -528,12 +529,21 @@ static inline int armv8pmu_enable_counter(int idx) static inline void armv8pmu_enable_event_counter(struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; int idx = event->hw.idx; + u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx)); - armv8pmu_enable_counter(idx); if (armv8pmu_event_is_chained(event)) - armv8pmu_enable_counter(idx - 1); - isb(); + counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1)); + + kvm_set_pmu_events(counter_bits, attr); + + /* We rely on the hypervisor switch code to enable guest counters */ + if (!kvm_pmu_counter_deferred(attr)) { + armv8pmu_enable_counter(idx); + if (armv8pmu_event_is_chained(event)) + armv8pmu_enable_counter(idx - 1); + } } static inline int armv8pmu_disable_counter(int idx) @@ -546,11 +556,21 @@ static inline int armv8pmu_disable_counter(int idx) static inline void armv8pmu_disable_event_counter(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_event_attr *attr = &event->attr; int idx = hwc->idx; + u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx)); if (armv8pmu_event_is_chained(event)) - armv8pmu_disable_counter(idx - 1); - armv8pmu_disable_counter(idx); + counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1)); + + kvm_clr_pmu_events(counter_bits); + + /* We rely on the hypervisor switch code to disable guest counters */ + if (!kvm_pmu_counter_deferred(attr)) { + if (armv8pmu_event_is_chained(event)) + armv8pmu_disable_counter(idx - 1); + armv8pmu_disable_counter(idx); + } } static inline int 
armv8pmu_enable_intens(int idx) @@ -827,14 +847,23 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, * with other architectures (x86 and Power). */ if (is_kernel_in_hyp_mode()) { - if (!attr->exclude_kernel) + if (!attr->exclude_kernel && !attr->exclude_host) config_base |= ARMV8_PMU_INCLUDE_EL2; - } else { - if (attr->exclude_kernel) + if (attr->exclude_guest) config_base |= ARMV8_PMU_EXCLUDE_EL1; - if (!attr->exclude_hv) + if (attr->exclude_host) + config_base |= ARMV8_PMU_EXCLUDE_EL0; + } else { + if (!attr->exclude_hv && !attr->exclude_host) config_base |= ARMV8_PMU_INCLUDE_EL2; } + + /* + * Filter out !VHE kernels and guest kernels + */ + if (attr->exclude_kernel) + config_base |= ARMV8_PMU_EXCLUDE_EL1; + if (attr->exclude_user) config_base |= ARMV8_PMU_EXCLUDE_EL0; @@ -864,6 +893,9 @@ static void armv8pmu_reset(void *info) armv8pmu_disable_intens(idx); } + /* Clear the counters we flip at guest entry/exit */ + kvm_clr_pmu_events(U32_MAX); + /* * Initialize & Reset PMNC. Request overflow interrupt for * 64 bit cycle counter but cheat in armv8pmu_write_counter(). 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 867a7cea70e5..a9b0485df074 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -296,11 +296,6 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) */ fpsimd_flush_task_state(current); - barrier(); - /* From now, fpsimd_thread_switch() won't clear TIF_FOREIGN_FPSTATE */ - - set_thread_flag(TIF_FOREIGN_FPSTATE); - barrier(); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ sve_alloc(current); diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 690e033a91c0..3ac1a64d2fb9 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -17,7 +17,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o -kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index aac7808ce216..6e3c9c8b2df9 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/thread_info.h> #include <linux/kvm_host.h> +#include <asm/fpsimd.h> #include <asm/kvm_asm.h> #include <asm/kvm_host.h> #include <asm/kvm_mmu.h> @@ -85,9 +86,12 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) WARN_ON_ONCE(!irqs_disabled()); if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { - fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.gp_regs.fp_regs); + fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.gp_regs.fp_regs, + vcpu->arch.sve_state, + vcpu->arch.sve_max_vl); + clear_thread_flag(TIF_FOREIGN_FPSTATE); - clear_thread_flag(TIF_SVE); + 
update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu)); } } @@ -100,14 +104,21 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) { unsigned long flags; + bool host_has_sve = system_supports_sve(); + bool guest_has_sve = vcpu_has_sve(vcpu); local_irq_save(flags); if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + u64 *guest_zcr = &vcpu->arch.ctxt.sys_regs[ZCR_EL1]; + /* Clean guest FP state to memory and invalidate cpu view */ fpsimd_save(); fpsimd_flush_cpu_state(); - } else if (system_supports_sve()) { + + if (guest_has_sve) + *guest_zcr = read_sysreg_s(SYS_ZCR_EL12); + } else if (host_has_sve) { /* * The FPSIMD/SVE state in the CPU has not been touched, and we * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index dd436a50fce7..3ae2f82fca46 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -19,18 +19,25 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +#include <linux/bits.h> #include <linux/errno.h> #include <linux/err.h> +#include <linux/nospec.h> #include <linux/kvm_host.h> #include <linux/module.h> +#include <linux/stddef.h> +#include <linux/string.h> #include <linux/vmalloc.h> #include <linux/fs.h> #include <kvm/arm_psci.h> #include <asm/cputype.h> #include <linux/uaccess.h> +#include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_coproc.h> +#include <asm/kvm_host.h> +#include <asm/sigcontext.h> #include "trace.h" @@ -52,12 +59,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +static bool core_reg_offset_is_vreg(u64 off) +{ + return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && + off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr); +} + static u64 core_reg_offset_from_id(u64 id) { return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } -static int validate_core_offset(const struct kvm_one_reg *reg) +static int validate_core_offset(const struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) { u64 off = core_reg_offset_from_id(reg->id); int size; @@ -89,11 +103,19 @@ static int validate_core_offset(const struct kvm_one_reg *reg) return -EINVAL; } - if (KVM_REG_SIZE(reg->id) == size && - IS_ALIGNED(off, size / sizeof(__u32))) - return 0; + if (KVM_REG_SIZE(reg->id) != size || + !IS_ALIGNED(off, size / sizeof(__u32))) + return -EINVAL; - return -EINVAL; + /* + * The KVM_REG_ARM64_SVE regs must be used instead of + * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on + * SVE-enabled vcpus: + */ + if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off)) + return -EINVAL; + + return 0; } static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) @@ -115,7 +137,7 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; - if (validate_core_offset(reg)) + if (validate_core_offset(vcpu, reg)) return -EINVAL; if (copy_to_user(uaddr, 
((u32 *)regs) + off, KVM_REG_SIZE(reg->id))) @@ -140,7 +162,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; - if (validate_core_offset(reg)) + if (validate_core_offset(vcpu, reg)) return -EINVAL; if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) @@ -183,6 +205,239 @@ out: return err; } +#define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64) +#define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64) + +static bool vq_present( + const u64 (*const vqs)[KVM_ARM64_SVE_VLS_WORDS], + unsigned int vq) +{ + return (*vqs)[vq_word(vq)] & vq_mask(vq); +} + +static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + unsigned int max_vq, vq; + u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; + + if (!vcpu_has_sve(vcpu)) + return -ENOENT; + + if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl))) + return -EINVAL; + + memset(vqs, 0, sizeof(vqs)); + + max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); + for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) + if (sve_vq_available(vq)) + vqs[vq_word(vq)] |= vq_mask(vq); + + if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs))) + return -EFAULT; + + return 0; +} + +static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + unsigned int max_vq, vq; + u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; + + if (!vcpu_has_sve(vcpu)) + return -ENOENT; + + if (kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; /* too late! */ + + if (WARN_ON(vcpu->arch.sve_state)) + return -EINVAL; + + if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs))) + return -EFAULT; + + max_vq = 0; + for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq) + if (vq_present(&vqs, vq)) + max_vq = vq; + + if (max_vq > sve_vq_from_vl(kvm_sve_max_vl)) + return -EINVAL; + + /* + * Vector lengths supported by the host can't currently be + * hidden from the guest individually: instead we can only set a + * maxmium via ZCR_EL2.LEN. 
So, make sure the available vector + * lengths match the set requested exactly up to the requested + * maximum: + */ + for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) + if (vq_present(&vqs, vq) != sve_vq_available(vq)) + return -EINVAL; + + /* Can't run with no vector lengths at all: */ + if (max_vq < SVE_VQ_MIN) + return -EINVAL; + + /* vcpu->arch.sve_state will be alloc'd by kvm_vcpu_finalize_sve() */ + vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq); + + return 0; +} + +#define SVE_REG_SLICE_SHIFT 0 +#define SVE_REG_SLICE_BITS 5 +#define SVE_REG_ID_SHIFT (SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS) +#define SVE_REG_ID_BITS 5 + +#define SVE_REG_SLICE_MASK \ + GENMASK(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS - 1, \ + SVE_REG_SLICE_SHIFT) +#define SVE_REG_ID_MASK \ + GENMASK(SVE_REG_ID_SHIFT + SVE_REG_ID_BITS - 1, SVE_REG_ID_SHIFT) + +#define SVE_NUM_SLICES (1 << SVE_REG_SLICE_BITS) + +#define KVM_SVE_ZREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_ZREG(0, 0)) +#define KVM_SVE_PREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_PREG(0, 0)) + +/* + * Number of register slices required to cover each whole SVE register. + * NOTE: Only the first slice ever exists, for now.
+ * If you are tempted to modify this, you must also rework sve_reg_to_region() + * to match: + */ +#define vcpu_sve_slices(vcpu) 1 + +/* Bounds of a single SVE register slice within vcpu->arch.sve_state */ +struct sve_state_reg_region { + unsigned int koffset; /* offset into sve_state in kernel memory */ + unsigned int klen; /* length in kernel memory */ + unsigned int upad; /* extra trailing padding in user memory */ +}; + +/* + * Validate SVE register ID and get sanitised bounds for user/kernel SVE + * register copy + */ +static int sve_reg_to_region(struct sve_state_reg_region *region, + struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + /* reg ID ranges for Z- registers */ + const u64 zreg_id_min = KVM_REG_ARM64_SVE_ZREG(0, 0); + const u64 zreg_id_max = KVM_REG_ARM64_SVE_ZREG(SVE_NUM_ZREGS - 1, + SVE_NUM_SLICES - 1); + + /* reg ID ranges for P- registers and FFR (which are contiguous) */ + const u64 preg_id_min = KVM_REG_ARM64_SVE_PREG(0, 0); + const u64 preg_id_max = KVM_REG_ARM64_SVE_FFR(SVE_NUM_SLICES - 1); + + unsigned int vq; + unsigned int reg_num; + + unsigned int reqoffset, reqlen; /* User-requested offset and length */ + unsigned int maxlen; /* Maximum permitted length */ + + size_t sve_state_size; + + const u64 last_preg_id = KVM_REG_ARM64_SVE_PREG(SVE_NUM_PREGS - 1, + SVE_NUM_SLICES - 1); + + /* Verify that the P-regs and FFR really do have contiguous IDs: */ + BUILD_BUG_ON(KVM_REG_ARM64_SVE_FFR(0) != last_preg_id + 1); + + /* Verify that we match the UAPI header: */ + BUILD_BUG_ON(SVE_NUM_SLICES != KVM_ARM64_SVE_MAX_SLICES); + + reg_num = (reg->id & SVE_REG_ID_MASK) >> SVE_REG_ID_SHIFT; + + if (reg->id >= zreg_id_min && reg->id <= zreg_id_max) { + if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) + return -ENOENT; + + vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); + + reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) - + SVE_SIG_REGS_OFFSET; + reqlen = KVM_SVE_ZREG_SIZE; + maxlen = SVE_SIG_ZREG_SIZE(vq); + } else if (reg->id >= 
preg_id_min && reg->id <= preg_id_max) { + if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) + return -ENOENT; + + vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); + + reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) - + SVE_SIG_REGS_OFFSET; + reqlen = KVM_SVE_PREG_SIZE; + maxlen = SVE_SIG_PREG_SIZE(vq); + } else { + return -EINVAL; + } + + sve_state_size = vcpu_sve_state_size(vcpu); + if (WARN_ON(!sve_state_size)) + return -EINVAL; + + region->koffset = array_index_nospec(reqoffset, sve_state_size); + region->klen = min(maxlen, reqlen); + region->upad = reqlen - region->klen; + + return 0; +} + +static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + int ret; + struct sve_state_reg_region region; + char __user *uptr = (char __user *)reg->addr; + + /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ + if (reg->id == KVM_REG_ARM64_SVE_VLS) + return get_sve_vls(vcpu, reg); + + /* Try to interpret reg ID as an architectural SVE register... */ + ret = sve_reg_to_region(&region, vcpu, reg); + if (ret) + return ret; + + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; + + if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset, + region.klen) || + clear_user(uptr + region.klen, region.upad)) + return -EFAULT; + + return 0; +} + +static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + int ret; + struct sve_state_reg_region region; + const char __user *uptr = (const char __user *)reg->addr; + + /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ + if (reg->id == KVM_REG_ARM64_SVE_VLS) + return set_sve_vls(vcpu, reg); + + /* Try to interpret reg ID as an architectural SVE register... 
*/ + ret = sve_reg_to_region(&region, vcpu, reg); + if (ret) + return ret; + + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; + + if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr, + region.klen)) + return -EFAULT; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; @@ -193,9 +448,37 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return -EINVAL; } -static unsigned long num_core_regs(void) +static int copy_core_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + unsigned int i; + int n = 0; + const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE; + + for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { + /* + * The KVM_REG_ARM64_SVE regs must be used instead of + * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on + * SVE-enabled vcpus: + */ + if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(i)) + continue; + + if (uindices) { + if (put_user(core_reg | i, uindices)) + return -EFAULT; + uindices++; + } + + n++; + } + + return n; +} + +static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) { - return sizeof(struct kvm_regs) / sizeof(__u32); + return copy_core_reg_indices(vcpu, NULL); } /** @@ -251,6 +534,67 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? 
-EFAULT : 0; } +static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) +{ + const unsigned int slices = vcpu_sve_slices(vcpu); + + if (!vcpu_has_sve(vcpu)) + return 0; + + /* Policed by KVM_GET_REG_LIST: */ + WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); + + return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + + 1; /* KVM_REG_ARM64_SVE_VLS */ +} + +static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, + u64 __user *uindices) +{ + const unsigned int slices = vcpu_sve_slices(vcpu); + u64 reg; + unsigned int i, n; + int num_regs = 0; + + if (!vcpu_has_sve(vcpu)) + return 0; + + /* Policed by KVM_GET_REG_LIST: */ + WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); + + /* + * Enumerate this first, so that userspace can save/restore in + * the order reported by KVM_GET_REG_LIST: + */ + reg = KVM_REG_ARM64_SVE_VLS; + if (put_user(reg, uindices++)) + return -EFAULT; + ++num_regs; + + for (i = 0; i < slices; i++) { + for (n = 0; n < SVE_NUM_ZREGS; n++) { + reg = KVM_REG_ARM64_SVE_ZREG(n, i); + if (put_user(reg, uindices++)) + return -EFAULT; + num_regs++; + } + + for (n = 0; n < SVE_NUM_PREGS; n++) { + reg = KVM_REG_ARM64_SVE_PREG(n, i); + if (put_user(reg, uindices++)) + return -EFAULT; + num_regs++; + } + + reg = KVM_REG_ARM64_SVE_FFR(i); + if (put_user(reg, uindices++)) + return -EFAULT; + num_regs++; + } + + return num_regs; +} + /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG * @@ -258,8 +602,15 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) */ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { - return num_core_regs() + kvm_arm_num_sys_reg_descs(vcpu) - + kvm_arm_get_fw_num_regs(vcpu) + NUM_TIMER_REGS; + unsigned long res = 0; + + res += num_core_regs(vcpu); + res += num_sve_regs(vcpu); + res += kvm_arm_num_sys_reg_descs(vcpu); + res += kvm_arm_get_fw_num_regs(vcpu); + res += NUM_TIMER_REGS; + + return res; } /** @@ -269,23 +620,25 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu 
*vcpu) */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; - const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE; int ret; - for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { - if (put_user(core_reg | i, uindices)) - return -EFAULT; - uindices++; - } + ret = copy_core_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; + + ret = copy_sve_reg_indices(vcpu, uindices); + if (ret < 0) + return ret; + uindices += ret; ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); - if (ret) + if (ret < 0) return ret; uindices += kvm_arm_get_fw_num_regs(vcpu); ret = copy_timer_indices(vcpu, uindices); - if (ret) + if (ret < 0) return ret; uindices += NUM_TIMER_REGS; @@ -298,12 +651,11 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; - /* Register group 16 means we want a core register. */ - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) - return get_core_reg(vcpu, reg); - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) - return kvm_arm_get_fw_reg(vcpu, reg); + switch (reg->id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg); + case KVM_REG_ARM_FW: return kvm_arm_get_fw_reg(vcpu, reg); + case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); + } if (is_timer_reg(reg->id)) return get_timer_reg(vcpu, reg); @@ -317,12 +669,11 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; - /* Register group 16 means we set a core register. 
*/ - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) - return set_core_reg(vcpu, reg); - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) - return kvm_arm_set_fw_reg(vcpu, reg); + switch (reg->id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); + case KVM_REG_ARM_FW: return kvm_arm_set_fw_reg(vcpu, reg); + case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); + } if (is_timer_reg(reg->id)) return set_timer_reg(vcpu, reg); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 0b7983442071..516aead3c2a9 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -173,20 +173,40 @@ static int handle_sve(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +#define __ptrauth_save_key(regs, key) \ +({ \ + regs[key ## KEYLO_EL1] = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \ + regs[key ## KEYHI_EL1] = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \ +}) + +/* + * Handle the guest trying to use a ptrauth instruction, or trying to access a + * ptrauth register. + */ +void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *ctxt; + + if (vcpu_has_ptrauth(vcpu)) { + vcpu_ptrauth_enable(vcpu); + ctxt = vcpu->arch.host_cpu_context; + __ptrauth_save_key(ctxt->sys_regs, APIA); + __ptrauth_save_key(ctxt->sys_regs, APIB); + __ptrauth_save_key(ctxt->sys_regs, APDA); + __ptrauth_save_key(ctxt->sys_regs, APDB); + __ptrauth_save_key(ctxt->sys_regs, APGA); + } else { + kvm_inject_undefined(vcpu); + } +} + /* * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into * a NOP). */ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu, struct kvm_run *run) { - /* - * We don't currently support ptrauth in a guest, and we mask the ID - * registers to prevent well-behaved guests from trying to make use of - * it. - * - * Inject an UNDEF, as if the feature really isn't present. 
- */ - kvm_inject_undefined(vcpu); + kvm_arm_vcpu_ptrauth_trap(vcpu); return 1; } diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 675fdc186e3b..93ba3d7ef027 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -24,6 +24,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_ptrauth.h> #define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x) #define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x) @@ -64,6 +65,13 @@ ENTRY(__guest_enter) add x18, x0, #VCPU_CONTEXT + // Macro ptrauth_switch_to_guest format: + // ptrauth_switch_to_guest(guest cxt, tmp1, tmp2, tmp3) + // The below macro to restore guest keys is not implemented in C code + // as it may cause Pointer Authentication key signing mismatch errors + // when this feature is enabled for kernel code. + ptrauth_switch_to_guest x18, x0, x1, x2 + // Restore guest regs x0-x17 ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)] ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)] @@ -118,6 +126,13 @@ ENTRY(__guest_exit) get_host_ctxt x2, x3 + // Macro ptrauth_switch_to_host format: + // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3) + // The below macro to save/restore keys is not implemented in C code + // as it may cause Pointer Authentication key signing mismatch errors + // when this feature is enabled for kernel code. 
+ ptrauth_switch_to_host x1, x2, x3, x4, x5 + // Now restore the host regs restore_callee_saved_regs x2 diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 3563fe655cd5..22b4c335e0b2 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -100,7 +100,10 @@ static void activate_traps_vhe(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_EL1_TTA; val &= ~CPACR_EL1_ZEN; - if (!update_fp_enabled(vcpu)) { + if (update_fp_enabled(vcpu)) { + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } else { val &= ~CPACR_EL1_FPEN; __activate_traps_fpsimd32(vcpu); } @@ -317,16 +320,48 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) +/* Check for an FPSIMD/SVE trap and handle as appropriate */ +static bool __hyp_text __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) { - struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; + bool vhe, sve_guest, sve_host; + u8 hsr_ec; - if (has_vhe()) - write_sysreg(read_sysreg(cpacr_el1) | CPACR_EL1_FPEN, - cpacr_el1); - else + if (!system_supports_fpsimd()) + return false; + + if (system_supports_sve()) { + sve_guest = vcpu_has_sve(vcpu); + sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE; + vhe = true; + } else { + sve_guest = false; + sve_host = false; + vhe = has_vhe(); + } + + hsr_ec = kvm_vcpu_trap_get_class(vcpu); + if (hsr_ec != ESR_ELx_EC_FP_ASIMD && + hsr_ec != ESR_ELx_EC_SVE) + return false; + + /* Don't handle SVE traps for non-SVE vcpus here: */ + if (!sve_guest) + if (hsr_ec != ESR_ELx_EC_FP_ASIMD) + return false; + + /* Valid trap. 
Switch the context: */ + + if (vhe) { + u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN; + + if (sve_guest) + reg |= CPACR_EL1_ZEN; + + write_sysreg(reg, cpacr_el1); + } else { write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP, cptr_el2); + } isb(); @@ -335,21 +370,28 @@ static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) * In the SVE case, VHE is assumed: it is enforced by * Kconfig and kvm_arch_init(). */ - if (system_supports_sve() && - (vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE)) { + if (sve_host) { struct thread_struct *thread = container_of( - host_fpsimd, + vcpu->arch.host_fpsimd_state, struct thread_struct, uw.fpsimd_state); - sve_save_state(sve_pffr(thread), &host_fpsimd->fpsr); + sve_save_state(sve_pffr(thread), + &vcpu->arch.host_fpsimd_state->fpsr); } else { - __fpsimd_save_state(host_fpsimd); + __fpsimd_save_state(vcpu->arch.host_fpsimd_state); } vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; } - __fpsimd_restore_state(&vcpu->arch.ctxt.gp_regs.fp_regs); + if (sve_guest) { + sve_load_state(vcpu_sve_pffr(vcpu), + &vcpu->arch.ctxt.gp_regs.fp_regs.fpsr, + sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1); + write_sysreg_s(vcpu->arch.ctxt.sys_regs[ZCR_EL1], SYS_ZCR_EL12); + } else { + __fpsimd_restore_state(&vcpu->arch.ctxt.gp_regs.fp_regs); + } /* Skip restoring fpexc32 for AArch64 guests */ if (!(read_sysreg(hcr_el2) & HCR_RW)) @@ -385,10 +427,10 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) * and restore the guest context lazily. * If FP/SIMD is not implemented, handle the trap and inject an * undefined instruction exception to the guest. + * Similarly for trapped SVE accesses. 
*/ - if (system_supports_fpsimd() && - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_FP_ASIMD) - return __hyp_switch_fpsimd(vcpu); + if (__hyp_handle_fpsimd(vcpu)) + return true; if (!__populate_fault_info(vcpu)) return true; @@ -524,6 +566,7 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; + bool pmu_switch_needed; u64 exit_code; /* @@ -543,6 +586,8 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; + pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); + __sysreg_save_state_nvhe(host_ctxt); __activate_vm(kern_hyp_va(vcpu->kvm)); @@ -589,6 +634,9 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) */ __debug_switch_to_host(vcpu); + if (pmu_switch_needed) + __pmu_switch_to_host(host_ctxt); + /* Returning to host will clear PSR.I, remask PMR if needed */ if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQOFF); diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c new file mode 100644 index 000000000000..3da94a5bb6b7 --- /dev/null +++ b/arch/arm64/kvm/pmu.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Arm Limited + * Author: Andrew Murray <Andrew.Murray@arm.com> + */ +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include <asm/kvm_hyp.h> + +/* + * Given the perf event attributes and system type, determine + * if we are going to need to switch counters at guest entry/exit. + */ +static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) +{ + /** + * With VHE the guest kernel runs at EL1 and the host at EL2, + * where user (EL0) is excluded then we have no reason to switch + * counters. + */ + if (has_vhe() && attr->exclude_user) + return false; + + /* Only switch if attributes are different */ + return (attr->exclude_host != attr->exclude_guest); +} + +/* + * Add events to track that we may want to switch at guest entry/exit + * time. 
+ */ +void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) +{ + struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + + if (!kvm_pmu_switch_needed(attr)) + return; + + if (!attr->exclude_host) + ctx->pmu_events.events_host |= set; + if (!attr->exclude_guest) + ctx->pmu_events.events_guest |= set; +} + +/* + * Stop tracking events + */ +void kvm_clr_pmu_events(u32 clr) +{ + struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + + ctx->pmu_events.events_host &= ~clr; + ctx->pmu_events.events_guest &= ~clr; +} + +/** + * Disable host events, enable guest events + */ +bool __hyp_text __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt) +{ + struct kvm_host_data *host; + struct kvm_pmu_events *pmu; + + host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); + pmu = &host->pmu_events; + + if (pmu->events_host) + write_sysreg(pmu->events_host, pmcntenclr_el0); + + if (pmu->events_guest) + write_sysreg(pmu->events_guest, pmcntenset_el0); + + return (pmu->events_host || pmu->events_guest); +} + +/** + * Disable guest events, enable host events + */ +void __hyp_text __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt) +{ + struct kvm_host_data *host; + struct kvm_pmu_events *pmu; + + host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); + pmu = &host->pmu_events; + + if (pmu->events_guest) + write_sysreg(pmu->events_guest, pmcntenclr_el0); + + if (pmu->events_host) + write_sysreg(pmu->events_host, pmcntenset_el0); +} + +#define PMEVTYPER_READ_CASE(idx) \ + case idx: \ + return read_sysreg(pmevtyper##idx##_el0) + +#define PMEVTYPER_WRITE_CASE(idx) \ + case idx: \ + write_sysreg(val, pmevtyper##idx##_el0); \ + break + +#define PMEVTYPER_CASES(readwrite) \ + PMEVTYPER_##readwrite##_CASE(0); \ + PMEVTYPER_##readwrite##_CASE(1); \ + PMEVTYPER_##readwrite##_CASE(2); \ + PMEVTYPER_##readwrite##_CASE(3); \ + PMEVTYPER_##readwrite##_CASE(4); \ + PMEVTYPER_##readwrite##_CASE(5); \ + PMEVTYPER_##readwrite##_CASE(6); \ + 
PMEVTYPER_##readwrite##_CASE(7); \ + PMEVTYPER_##readwrite##_CASE(8); \ + PMEVTYPER_##readwrite##_CASE(9); \ + PMEVTYPER_##readwrite##_CASE(10); \ + PMEVTYPER_##readwrite##_CASE(11); \ + PMEVTYPER_##readwrite##_CASE(12); \ + PMEVTYPER_##readwrite##_CASE(13); \ + PMEVTYPER_##readwrite##_CASE(14); \ + PMEVTYPER_##readwrite##_CASE(15); \ + PMEVTYPER_##readwrite##_CASE(16); \ + PMEVTYPER_##readwrite##_CASE(17); \ + PMEVTYPER_##readwrite##_CASE(18); \ + PMEVTYPER_##readwrite##_CASE(19); \ + PMEVTYPER_##readwrite##_CASE(20); \ + PMEVTYPER_##readwrite##_CASE(21); \ + PMEVTYPER_##readwrite##_CASE(22); \ + PMEVTYPER_##readwrite##_CASE(23); \ + PMEVTYPER_##readwrite##_CASE(24); \ + PMEVTYPER_##readwrite##_CASE(25); \ + PMEVTYPER_##readwrite##_CASE(26); \ + PMEVTYPER_##readwrite##_CASE(27); \ + PMEVTYPER_##readwrite##_CASE(28); \ + PMEVTYPER_##readwrite##_CASE(29); \ + PMEVTYPER_##readwrite##_CASE(30) + +/* + * Read a value direct from PMEVTYPER<idx> where idx is 0-30 + * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + */ +static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) +{ + switch (idx) { + PMEVTYPER_CASES(READ); + case ARMV8_PMU_CYCLE_IDX: + return read_sysreg(pmccfiltr_el0); + default: + WARN_ON(1); + } + + return 0; +} + +/* + * Write a value direct to PMEVTYPER<idx> where idx is 0-30 + * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). 
+ */ +static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) +{ + switch (idx) { + PMEVTYPER_CASES(WRITE); + case ARMV8_PMU_CYCLE_IDX: + write_sysreg(val, pmccfiltr_el0); + break; + default: + WARN_ON(1); + } +} + +/* + * Modify ARMv8 PMU events to include EL0 counting + */ +static void kvm_vcpu_pmu_enable_el0(unsigned long events) +{ + u64 typer; + u32 counter; + + for_each_set_bit(counter, &events, 32) { + typer = kvm_vcpu_pmu_read_evtype_direct(counter); + typer &= ~ARMV8_PMU_EXCLUDE_EL0; + kvm_vcpu_pmu_write_evtype_direct(counter, typer); + } +} + +/* + * Modify ARMv8 PMU events to exclude EL0 counting + */ +static void kvm_vcpu_pmu_disable_el0(unsigned long events) +{ + u64 typer; + u32 counter; + + for_each_set_bit(counter, &events, 32) { + typer = kvm_vcpu_pmu_read_evtype_direct(counter); + typer |= ARMV8_PMU_EXCLUDE_EL0; + kvm_vcpu_pmu_write_evtype_direct(counter, typer); + } +} + +/* + * On VHE ensure that only guest events have EL0 counting enabled + */ +void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *host_ctxt; + struct kvm_host_data *host; + u32 events_guest, events_host; + + if (!has_vhe()) + return; + + host_ctxt = vcpu->arch.host_cpu_context; + host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); + events_guest = host->pmu_events.events_guest; + events_host = host->pmu_events.events_host; + + kvm_vcpu_pmu_enable_el0(events_guest); + kvm_vcpu_pmu_disable_el0(events_host); +} + +/* + * On VHE ensure that only host events have EL0 counting enabled + */ +void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *host_ctxt; + struct kvm_host_data *host; + u32 events_guest, events_host; + + if (!has_vhe()) + return; + + host_ctxt = vcpu->arch.host_cpu_context; + host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); + events_guest = host->pmu_events.events_guest; + events_host = host->pmu_events.events_host; + + kvm_vcpu_pmu_enable_el0(events_host); + 
kvm_vcpu_pmu_disable_el0(events_guest); +} diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e2a0500cd7a2..1140b4485575 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -20,20 +20,26 @@ */ #include <linux/errno.h> +#include <linux/kernel.h> #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/hw_breakpoint.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/types.h> #include <kvm/arm_arch_timer.h> #include <asm/cpufeature.h> #include <asm/cputype.h> +#include <asm/fpsimd.h> #include <asm/ptrace.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_coproc.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/virt.h> /* Maximum phys_shift supported for any VM on this host */ static u32 kvm_ipa_limit; @@ -92,6 +98,14 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_VM_IPA_SIZE: r = kvm_ipa_limit; break; + case KVM_CAP_ARM_SVE: + r = system_supports_sve(); + break; + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + case KVM_CAP_ARM_PTRAUTH_GENERIC: + r = has_vhe() && system_supports_address_auth() && + system_supports_generic_auth(); + break; default: r = 0; } @@ -99,13 +113,148 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } +unsigned int kvm_sve_max_vl; + +int kvm_arm_init_sve(void) +{ + if (system_supports_sve()) { + kvm_sve_max_vl = sve_max_virtualisable_vl; + + /* + * The get_sve_reg()/set_sve_reg() ioctl interface will need + * to be extended with multiple register slice support in + * order to support vector lengths greater than + * SVE_VL_ARCH_MAX: + */ + if (WARN_ON(kvm_sve_max_vl > SVE_VL_ARCH_MAX)) + kvm_sve_max_vl = SVE_VL_ARCH_MAX; + + /* + * Don't even try to make use of vector lengths that + * aren't available on all CPUs, for now: + */ + if (kvm_sve_max_vl < sve_max_vl) + pr_warn("KVM: SVE vector length for guests limited to %u bytes\n", + kvm_sve_max_vl); + } + + return 0; +} + +static int 
kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) +{ + if (!system_supports_sve()) + return -EINVAL; + + /* Verify that KVM startup enforced this when SVE was detected: */ + if (WARN_ON(!has_vhe())) + return -EINVAL; + + vcpu->arch.sve_max_vl = kvm_sve_max_vl; + + /* + * Userspace can still customize the vector lengths by writing + * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until + * kvm_arm_vcpu_finalize(), which freezes the configuration. + */ + vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE; + + return 0; +} + +/* + * Finalize vcpu's maximum SVE vector length, allocating + * vcpu->arch.sve_state as necessary. + */ +static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) +{ + void *buf; + unsigned int vl; + + vl = vcpu->arch.sve_max_vl; + + /* + * Responsibility for these properties is shared between + * kvm_arm_init_arch_resources(), kvm_vcpu_enable_sve() and + * set_sve_vls(). Double-check here just to be sure: + */ + if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl || + vl > SVE_VL_ARCH_MAX)) + return -EIO; + + buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + vcpu->arch.sve_state = buf; + vcpu->arch.flags |= KVM_ARM64_VCPU_SVE_FINALIZED; + return 0; +} + +int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) +{ + switch (feature) { + case KVM_ARM_VCPU_SVE: + if (!vcpu_has_sve(vcpu)) + return -EINVAL; + + if (kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; + + return kvm_vcpu_finalize_sve(vcpu); + } + + return -EINVAL; +} + +bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu)) + return false; + + return true; +} + +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kfree(vcpu->arch.sve_state); +} + +static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_sve(vcpu)) + memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); +} + +static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +{ + /* Support 
ptrauth only if the system supports these capabilities. */ + if (!has_vhe()) + return -EINVAL; + + if (!system_supports_address_auth() || + !system_supports_generic_auth()) + return -EINVAL; + /* + * For now make sure that both address/generic pointer authentication + * features are requested by the userspace together. + */ + if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || + !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) + return -EINVAL; + + vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_PTRAUTH; + return 0; +} + /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on * the virtual CPU struct to their architecturally defined reset - * values. + * values, except for registers whose reset is deferred until + * kvm_arm_vcpu_finalize(). * * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT * ioctl or as part of handling a request issued by another VCPU in the PSCI @@ -131,6 +280,22 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) if (loaded) kvm_arch_vcpu_put(vcpu); + if (!kvm_arm_vcpu_sve_finalized(vcpu)) { + if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) { + ret = kvm_vcpu_enable_sve(vcpu); + if (ret) + goto out; + } + } else { + kvm_vcpu_reset_sve(vcpu); + } + + if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || + test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) { + if (kvm_vcpu_enable_ptrauth(vcpu)) + goto out; + } + switch (vcpu->arch.target) { default: if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 539feecda5b8..857b226bcdde 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -695,6 +695,7 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, val |= p->regval & ARMV8_PMU_PMCR_MASK; __vcpu_sys_reg(vcpu, PMCR_EL0) = val; kvm_pmu_handle_pmcr(vcpu, 
val); + kvm_vcpu_pmu_restore_guest(vcpu); } else { /* PMCR.P & PMCR.C are RAZ */ val = __vcpu_sys_reg(vcpu, PMCR_EL0) @@ -850,6 +851,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) { kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); __vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK; + kvm_vcpu_pmu_restore_guest(vcpu); } else { p->regval = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK; } @@ -875,6 +877,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMCNTENSET_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; kvm_pmu_enable_counter(vcpu, val); + kvm_vcpu_pmu_restore_guest(vcpu); } else { /* accessing PMCNTENCLR_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; @@ -1007,6 +1010,37 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } +static bool trap_ptrauth(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + kvm_arm_vcpu_ptrauth_trap(vcpu); + + /* + * Return false for both cases as we never skip the trapped + * instruction: + * + * - Either we re-execute the same key register access instruction + * after enabling ptrauth. + * - Or an UNDEF is injected as ptrauth is not supported/enabled. + */ + return false; +} + +static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return vcpu_has_ptrauth(vcpu) ? 
0 : REG_HIDDEN_USER | REG_HIDDEN_GUEST; +} + +#define __PTRAUTH_KEY(k) \ + { SYS_DESC(SYS_## k), trap_ptrauth, reset_unknown, k, \ + .visibility = ptrauth_visibility} + +#define PTRAUTH_KEY(k) \ + __PTRAUTH_KEY(k ## KEYLO_EL1), \ + __PTRAUTH_KEY(k ## KEYHI_EL1) + static bool access_arch_timer(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1044,25 +1078,20 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(struct sys_reg_desc const *r, bool raz) +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r, bool raz) { u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); u64 val = raz ? 0 : read_sanitised_ftr_reg(id); - if (id == SYS_ID_AA64PFR0_EL1) { - if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT)) - kvm_debug("SVE unsupported for guests, suppressing\n"); - + if (id == SYS_ID_AA64PFR0_EL1 && !vcpu_has_sve(vcpu)) { val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); - } else if (id == SYS_ID_AA64ISAR1_EL1) { - const u64 ptrauth_mask = (0xfUL << ID_AA64ISAR1_APA_SHIFT) | - (0xfUL << ID_AA64ISAR1_API_SHIFT) | - (0xfUL << ID_AA64ISAR1_GPA_SHIFT) | - (0xfUL << ID_AA64ISAR1_GPI_SHIFT); - if (val & ptrauth_mask) - kvm_debug("ptrauth unsupported for guests, suppressing\n"); - val &= ~ptrauth_mask; + } else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) { + val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) | + (0xfUL << ID_AA64ISAR1_API_SHIFT) | + (0xfUL << ID_AA64ISAR1_GPA_SHIFT) | + (0xfUL << ID_AA64ISAR1_GPI_SHIFT)); } return val; @@ -1078,7 +1107,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_id_reg(r, raz); + p->regval = read_id_reg(vcpu, r, raz); return true; } @@ -1100,6 +1129,81 @@ static int reg_from_user(u64 *val, const void __user *uaddr, u64 id); static int reg_to_user(void __user *uaddr, const u64 *val, u64 
id); static u64 sys_reg_to_index(const struct sys_reg_desc *reg); +/* Visibility overrides for SVE-specific control registers */ +static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_sve(vcpu)) + return 0; + + return REG_HIDDEN_USER | REG_HIDDEN_GUEST; +} + +/* Visibility overrides for SVE-specific ID registers */ +static unsigned int sve_id_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_sve(vcpu)) + return 0; + + return REG_HIDDEN_USER; +} + +/* Generate the emulated ID_AA64ZFR0_EL1 value exposed to the guest */ +static u64 guest_id_aa64zfr0_el1(const struct kvm_vcpu *vcpu) +{ + if (!vcpu_has_sve(vcpu)) + return 0; + + return read_sanitised_ftr_reg(SYS_ID_AA64ZFR0_EL1); +} + +static bool access_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, rd); + + p->regval = guest_id_aa64zfr0_el1(vcpu); + return true; +} + +static int get_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + u64 val; + + if (WARN_ON(!vcpu_has_sve(vcpu))) + return -ENOENT; + + val = guest_id_aa64zfr0_el1(vcpu); + return reg_to_user(uaddr, &val, reg->id); +} + +static int set_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + const u64 id = sys_reg_to_index(rd); + int err; + u64 val; + + if (WARN_ON(!vcpu_has_sve(vcpu))) + return -ENOENT; + + err = reg_from_user(&val, uaddr, id); + if (err) + return err; + + /* This is what we mean by invariant: you can't change it. 
*/ + if (val != guest_id_aa64zfr0_el1(vcpu)) + return -EINVAL; + + return 0; +} + /* * cpufeature ID register user accessors * @@ -1107,16 +1211,18 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); * are stored, and for set_id_reg() we don't allow the effective value * to be changed. */ -static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, +static int __get_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(rd, raz); + const u64 val = read_id_reg(vcpu, rd, raz); return reg_to_user(uaddr, &val, id); } -static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, +static int __set_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); @@ -1128,7 +1234,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, return err; /* This is what we mean by invariant: you can't change it. 
*/ - if (val != read_id_reg(rd, raz)) + if (val != read_id_reg(vcpu, rd, raz)) return -EINVAL; return 0; @@ -1137,25 +1243,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, false); + return __get_id_reg(vcpu, rd, uaddr, false); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, false); + return __set_id_reg(vcpu, rd, uaddr, false); } static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, true); + return __get_id_reg(vcpu, rd, uaddr, true); } static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, true); + return __set_id_reg(vcpu, rd, uaddr, true); } static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -1343,7 +1449,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_SANITISED(ID_AA64PFR1_EL1), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), - ID_UNALLOCATED(4,4), + { SYS_DESC(SYS_ID_AA64ZFR0_EL1), access_id_aa64zfr0_el1, .get_user = get_id_aa64zfr0_el1, .set_user = set_id_aa64zfr0_el1, .visibility = sve_id_visibility }, ID_UNALLOCATED(4,5), ID_UNALLOCATED(4,6), ID_UNALLOCATED(4,7), @@ -1380,10 +1486,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, + { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { 
SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, + PTRAUTH_KEY(APIA), + PTRAUTH_KEY(APIB), + PTRAUTH_KEY(APDA), + PTRAUTH_KEY(APDB), + PTRAUTH_KEY(APGA), + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, @@ -1924,6 +2037,12 @@ static void perform_access(struct kvm_vcpu *vcpu, { trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); + /* Check for regs disabled by runtime config */ + if (sysreg_hidden_from_guest(vcpu, r)) { + kvm_inject_undefined(vcpu); + return; + } + /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. This certainly qualifies @@ -2435,6 +2554,10 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return get_invariant_sys_reg(reg->id, uaddr); + /* Check for regs disabled by runtime config */ + if (sysreg_hidden_from_user(vcpu, r)) + return -ENOENT; + if (r->get_user) return (r->get_user)(vcpu, r, reg, uaddr); @@ -2456,6 +2579,10 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return set_invariant_sys_reg(reg->id, uaddr); + /* Check for regs disabled by runtime config */ + if (sysreg_hidden_from_user(vcpu, r)) + return -ENOENT; + if (r->set_user) return (r->set_user)(vcpu, r, reg, uaddr); @@ -2512,7 +2639,8 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) return true; } -static int walk_one_sys_reg(const struct sys_reg_desc *rd, +static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 __user **uind, unsigned int *total) { @@ -2523,6 +2651,9 @@ static int walk_one_sys_reg(const struct sys_reg_desc *rd, if (!(rd->reg || rd->get_user)) return 0; + if (sysreg_hidden_from_user(vcpu, rd)) + return 0; + if (!copy_reg_to_user(rd, uind)) return -EFAULT; @@ -2551,9 +2682,9 @@ static int 
walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) int cmp = cmp_sys_reg(i1, i2); /* target-specific overrides generic entry. */ if (cmp <= 0) - err = walk_one_sys_reg(i1, &uind, &total); + err = walk_one_sys_reg(vcpu, i1, &uind, &total); else - err = walk_one_sys_reg(i2, &uind, &total); + err = walk_one_sys_reg(vcpu, i2, &uind, &total); if (err) return err; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 3b1bc7f01d0b..2be99508dcb9 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -64,8 +64,15 @@ struct sys_reg_desc { const struct kvm_one_reg *reg, void __user *uaddr); int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr); + + /* Return mask of REG_* runtime visibility overrides */ + unsigned int (*visibility)(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd); }; +#define REG_HIDDEN_USER (1 << 0) /* hidden from userspace ioctls */ +#define REG_HIDDEN_GUEST (1 << 1) /* hidden from guest */ + static inline void print_sys_reg_instr(const struct sys_reg_params *p) { /* Look, we even formatted it for you to paste into the table! 
*/ @@ -102,6 +109,24 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r __vcpu_sys_reg(vcpu, r->reg) = r->val; } +static inline bool sysreg_hidden_from_guest(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + if (likely(!r->visibility)) + return false; + + return r->visibility(vcpu, r) & REG_HIDDEN_GUEST; +} + +static inline bool sysreg_hidden_from_user(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + if (likely(!r->visibility)) + return false; + + return r->visibility(vcpu, r) & REG_HIDDEN_USER; +} + static inline int cmp_sys_reg(const struct sys_reg_desc *i1, const struct sys_reg_desc *i2) { diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 6b168d32fbff..2162eb32dcec 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -30,7 +30,6 @@ generic-y += pci.h generic-y += percpu.h generic-y += pgalloc.h generic-y += preempt.h -generic-y += segment.h generic-y += serial.h generic-y += shmparam.h generic-y += tlbflush.h diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 61c01db6c292..ecfc4b4b6373 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -23,6 +23,7 @@ config H8300 select HAVE_ARCH_KGDB select HAVE_ARCH_HASH select CPU_NO_EFFICIENT_FFS + select UACCESS_MEMCPY config CPU_BIG_ENDIAN def_bool y diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index f2e22058e488..79cd1e605ec4 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -47,6 +47,7 @@ generic-y += timex.h generic-y += tlbflush.h generic-y += topology.h generic-y += trace_clock.h +generic-y += uaccess.h generic-y += unaligned.h generic-y += vga.h generic-y += word-at-a-time.h diff --git a/arch/h8300/include/asm/uaccess.h b/arch/h8300/include/asm/uaccess.h deleted file mode 100644 index bc8031949d07..000000000000 --- a/arch/h8300/include/asm/uaccess.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ 
-#ifndef _ASM_UACCESS_H -#define _ASM_UACCESS_H - -#include <linux/string.h> - -static inline __must_check unsigned long -raw_copy_from_user(void *to, const void __user * from, unsigned long n) -{ - if (__builtin_constant_p(n)) { - switch(n) { - case 1: - *(u8 *)to = *(u8 __force *)from; - return 0; - case 2: - *(u16 *)to = *(u16 __force *)from; - return 0; - case 4: - *(u32 *)to = *(u32 __force *)from; - return 0; - } - } - - memcpy(to, (const void __force *)from, n); - return 0; -} - -static inline __must_check unsigned long -raw_copy_to_user(void __user *to, const void *from, unsigned long n) -{ - if (__builtin_constant_p(n)) { - switch(n) { - case 1: - *(u8 __force *)to = *(u8 *)from; - return 0; - case 2: - *(u16 __force *)to = *(u16 *)from; - return 0; - case 4: - *(u32 __force *)to = *(u32 *)from; - return 0; - default: - break; - } - } - - memcpy((void __force *)to, from, n); - return 0; -} -#define INLINE_COPY_FROM_USER -#define INLINE_COPY_TO_USER - -#include <asm-generic/uaccess.h> - -#endif diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index b32bfa1fe99e..23a979a85f14 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/delay.h> #include <linux/interrupt.h> +#include <linux/io.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/console.h> diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 4a3d72f76ea2..84bb1ed1b931 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -29,7 +29,6 @@ generic-y += pci.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h -generic-y += segment.h generic-y += serial.h generic-y += shmparam.h generic-y += topology.h diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h index a30e58d5f351..7a34092e8b58 100644 --- a/arch/hexagon/include/asm/uaccess.h +++ b/arch/hexagon/include/asm/uaccess.h @@ -24,7 +24,6 @@ 
* User space memory access functions */ #include <linux/mm.h> -#include <asm/segment.h> #include <asm/sections.h> /* diff --git a/arch/ia64/include/asm/segment.h b/arch/ia64/include/asm/segment.h deleted file mode 100644 index b89e2b3d648f..000000000000 --- a/arch/ia64/include/asm/segment.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_IA64_SEGMENT_H -#define _ASM_IA64_SEGMENT_H - -/* Only here because we have some old header files that expect it.. */ - -#endif /* _ASM_IA64_SEGMENT_H */ diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 56e3d0b685e1..e01df3f2f80d 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -348,3 +348,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index df4ec3ec71d1..7e3d0734b2f3 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -427,3 +427,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 4964947732af..26339e417695 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -433,3 +433,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter 
sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 677e5bfeff47..70d3200476bf 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -674,7 +674,10 @@ config SGI_IP27 select SYS_HAS_EARLY_PRINTK select HAVE_PCI select IRQ_MIPS_CPU + select IRQ_DOMAIN_HIERARCHY select NR_CPUS_DEFAULT_64 + select PCI_DRIVERS_GENERIC + select PCI_XTALK_BRIDGE select SYS_HAS_CPU_R10000 select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN @@ -1241,6 +1244,9 @@ config IRQ_GT641XX config PCI_GT64XXX_PCI0 bool +config PCI_XTALK_BRIDGE + bool + config NO_EXCEPT_FILL bool diff --git a/arch/mips/alchemy/common/platform.c b/arch/mips/alchemy/common/platform.c index 1454d9f6ab2d..b8f3397c59c9 100644 --- a/arch/mips/alchemy/common/platform.c +++ b/arch/mips/alchemy/common/platform.c @@ -131,9 +131,7 @@ static void __init alchemy_setup_uarts(int ctype) } -/* The dmamask must be set for OHCI/EHCI to work */ -static u64 alchemy_ohci_dmamask = DMA_BIT_MASK(32); -static u64 __maybe_unused alchemy_ehci_dmamask = DMA_BIT_MASK(32); +static u64 alchemy_all_dmamask = DMA_BIT_MASK(32); /* Power on callback for the ehci platform driver */ static int alchemy_ehci_power_on(struct platform_device *pdev) @@ -231,7 +229,7 @@ static void __init alchemy_setup_usb(int ctype) res[1].flags = IORESOURCE_IRQ; pdev->name = "ohci-platform"; pdev->id = 0; - pdev->dev.dma_mask = &alchemy_ohci_dmamask; + pdev->dev.dma_mask = &alchemy_all_dmamask; pdev->dev.platform_data = &alchemy_ohci_pdata; if (platform_device_register(pdev)) @@ -251,7 +249,7 @@ static void __init alchemy_setup_usb(int ctype) res[1].flags = IORESOURCE_IRQ; pdev->name = "ehci-platform"; pdev->id = 0; - pdev->dev.dma_mask = &alchemy_ehci_dmamask; + pdev->dev.dma_mask = 
&alchemy_all_dmamask; pdev->dev.platform_data = &alchemy_ehci_pdata; if (platform_device_register(pdev)) @@ -271,7 +269,7 @@ static void __init alchemy_setup_usb(int ctype) res[1].flags = IORESOURCE_IRQ; pdev->name = "ohci-platform"; pdev->id = 1; - pdev->dev.dma_mask = &alchemy_ohci_dmamask; + pdev->dev.dma_mask = &alchemy_all_dmamask; pdev->dev.platform_data = &alchemy_ohci_pdata; if (platform_device_register(pdev)) @@ -338,7 +336,11 @@ static struct platform_device au1xxx_eth0_device = { .name = "au1000-eth", .id = 0, .num_resources = MAC_RES_COUNT, - .dev.platform_data = &au1xxx_eth0_platform_data, + .dev = { + .dma_mask = &alchemy_all_dmamask, + .coherent_dma_mask = DMA_BIT_MASK(32), + .platform_data = &au1xxx_eth0_platform_data, + }, }; static struct resource au1xxx_eth1_resources[][MAC_RES_COUNT] __initdata = { @@ -370,7 +372,11 @@ static struct platform_device au1xxx_eth1_device = { .name = "au1000-eth", .id = 1, .num_resources = MAC_RES_COUNT, - .dev.platform_data = &au1xxx_eth1_platform_data, + .dev = { + .dma_mask = &alchemy_all_dmamask, + .coherent_dma_mask = DMA_BIT_MASK(32), + .platform_data = &au1xxx_eth1_platform_data, + }, }; void __init au1xxx_override_eth_cfg(unsigned int port, diff --git a/arch/mips/ath79/clock.c b/arch/mips/ath79/clock.c index d4ca97e2ec6c..228cdc736db7 100644 --- a/arch/mips/ath79/clock.c +++ b/arch/mips/ath79/clock.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/err.h> #include <linux/clk.h> #include <linux/clkdev.h> diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c index 25a57895a3a3..298b46b4e9cb 100644 --- a/arch/mips/ath79/setup.c +++ b/arch/mips/ath79/setup.c @@ -14,6 +14,7 @@ #include <linux/kernel.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/clk.h> diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig index ff40fbc2f439..21a1168ae301 100644 --- 
a/arch/mips/configs/ip22_defconfig +++ b/arch/mips/configs/ip22_defconfig @@ -228,7 +228,7 @@ CONFIG_SERIAL_IP22_ZILOG=m # CONFIG_HW_RANDOM is not set CONFIG_RAW_DRIVER=m # CONFIG_HWMON is not set -CONFIG_THERMAL=m +CONFIG_THERMAL=y CONFIG_WATCHDOG=y CONFIG_INDYDOG=m # CONFIG_VGA_CONSOLE is not set diff --git a/arch/mips/configs/ip27_defconfig b/arch/mips/configs/ip27_defconfig index 81c47e18131b..54db5dedf776 100644 --- a/arch/mips/configs/ip27_defconfig +++ b/arch/mips/configs/ip27_defconfig @@ -271,7 +271,7 @@ CONFIG_I2C_PARPORT_LIGHT=m CONFIG_I2C_TAOS_EVM=m CONFIG_I2C_STUB=m # CONFIG_HWMON is not set -CONFIG_THERMAL=m +CONFIG_THERMAL=y CONFIG_MFD_PCF50633=m CONFIG_PCF50633_ADC=m CONFIG_PCF50633_GPIO=m diff --git a/arch/mips/generic/init.c b/arch/mips/generic/init.c index a106f8113842..a84475f1924f 100644 --- a/arch/mips/generic/init.c +++ b/arch/mips/generic/init.c @@ -43,14 +43,14 @@ void __init *plat_get_fdt(void) /* Already set up */ return (void *)fdt; - if ((fw_arg0 == -2) && !fdt_check_header((void *)fw_arg1)) { + if ((fw_arg0 == -2) && !fdt_check_header((void *)fw_passed_dtb)) { /* * We booted using the UHI boot protocol, so we have been * provided with the appropriate device tree for the board. * Make use of it & search for any machine struct based upon * the root compatible string. 
*/ - fdt = (void *)fw_arg1; + fdt = (void *)fw_passed_dtb; for_each_mips_machine(check_mach) { match = mips_machine_is_compatible(check_mach, fdt); diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 87b86cdf126a..a03cd4e24f37 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -19,7 +19,6 @@ generic-y += preempt.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += sections.h -generic-y += segment.h generic-y += trace_clock.h generic-y += unaligned.h generic-y += user.h diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index 42ea1313626c..965f0793a5f9 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -7,18 +7,9 @@ #include <asm/mmzone.h> struct cpuinfo_ip27 { -// cpuid_t p_cpuid; /* PROM assigned cpuid */ cnodeid_t p_nodeid; /* my node ID in compact-id-space */ nasid_t p_nasid; /* my node ID in numa-as-id-space */ unsigned char p_slice; /* Physical position on node board */ -#if 0 - unsigned long loops_per_sec; - unsigned long ipi_count; - unsigned long irq_attempt[NR_IRQS]; - unsigned long smp_local_irq_count; - unsigned long prof_multiplier; - unsigned long prof_counter; -#endif }; extern struct cpuinfo_ip27 sn_cpu_info[NR_CPUS]; @@ -30,7 +21,7 @@ extern struct cpuinfo_ip27 sn_cpu_info[NR_CPUS]; struct pci_bus; extern int pcibus_to_node(struct pci_bus *); -#define cpumask_of_pcibus(bus) (cpu_online_mask) +#define cpumask_of_pcibus(bus) (cpumask_of_node(pcibus_to_node(bus))) extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; diff --git a/arch/mips/include/asm/pci/bridge.h b/arch/mips/include/asm/pci/bridge.h index 23574c27eb40..a92cd30b48c9 100644 --- a/arch/mips/include/asm/pci/bridge.h +++ b/arch/mips/include/asm/pci/bridge.h @@ -801,15 +801,13 @@ struct bridge_err_cmdword { #define PCI64_ATTR_RMF_SHFT 48 struct bridge_controller { - struct pci_controller pc; - 
struct resource mem; - struct resource io; struct resource busn; struct bridge_regs *base; - nasid_t nasid; - unsigned int widget_id; - u64 baddr; + unsigned long baddr; + unsigned long intr_addr; + struct irq_domain *domain; unsigned int pci_int[8]; + nasid_t nasid; }; #define BRIDGE_CONTROLLER(bus) \ @@ -822,8 +820,4 @@ struct bridge_controller { #define bridge_clr(bc, reg, val) \ __raw_writel(__raw_readl(&bc->base->reg) & ~(val), &bc->base->reg) -extern int request_bridge_irq(struct bridge_controller *bc, int pin); - -extern struct pci_ops bridge_pci_ops; - #endif /* _ASM_PCI_BRIDGE_H */ diff --git a/arch/mips/include/asm/sn/irq_alloc.h b/arch/mips/include/asm/sn/irq_alloc.h new file mode 100644 index 000000000000..09b89cecff56 --- /dev/null +++ b/arch/mips/include/asm/sn/irq_alloc.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SN_IRQ_ALLOC_H +#define __ASM_SN_IRQ_ALLOC_H + +struct irq_alloc_info { + void *ctrl; + nasid_t nasid; + int pin; +}; + +#endif /* __ASM_SN_IRQ_ALLOC_H */ diff --git a/arch/mips/include/asm/xtalk/xtalk.h b/arch/mips/include/asm/xtalk/xtalk.h index 26d2ed1fa917..680e7efebbaf 100644 --- a/arch/mips/include/asm/xtalk/xtalk.h +++ b/arch/mips/include/asm/xtalk/xtalk.h @@ -47,15 +47,6 @@ typedef struct xtalk_piomap_s *xtalk_piomap_t; #define XIO_PORT(x) ((xwidgetnum_t)(((x)&XIO_PORT_BITS) >> XIO_PORT_SHIFT)) #define XIO_PACK(p, o) ((((uint64_t)(p))<<XIO_PORT_SHIFT) | ((o)&XIO_ADDR_BITS)) -#ifdef CONFIG_PCI -extern int bridge_probe(nasid_t nasid, int widget, int masterwid); -#else -static inline int bridge_probe(nasid_t nasid, int widget, int masterwid) -{ - return 0; -} -#endif - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_XTALK_XTALK_H */ diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index d5e335e6846a..6126b77d5a62 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1973,6 +1973,14 @@ static inline void cpu_probe_ingenic(struct cpuinfo_mips *c, unsigned int cpu) 
panic("Unknown Ingenic Processor ID!"); break; } + + /* + * The config0 register in the Xburst CPUs with a processor ID of + * PRID_COMP_INGENIC_D0 report themselves as MIPS32r2 compatible, + * but they don't actually support this ISA. + */ + if ((c->processor_id & PRID_COMP_MASK) == PRID_COMP_INGENIC_D0) + c->isa_level &= ~MIPS_CPU_ISA_M32R2; } static inline void cpu_probe_netlogic(struct cpuinfo_mips *c, int cpu) diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 413863508f6f..d67fb64e908c 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -64,17 +64,11 @@ struct mips_perf_event { #define CNTR_EVEN 0x55555555 #define CNTR_ODD 0xaaaaaaaa #define CNTR_ALL 0xffffffff -#ifdef CONFIG_MIPS_MT_SMP enum { T = 0, V = 1, P = 2, } range; -#else - #define T - #define V - #define P -#endif }; static struct mips_perf_event raw_event; @@ -325,9 +319,7 @@ static void mipsxx_pmu_enable_event(struct hw_perf_event *evt, int idx) { struct perf_event *event = container_of(evt, struct perf_event, hw); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); -#ifdef CONFIG_MIPS_MT_SMP unsigned int range = evt->event_base >> 24; -#endif /* CONFIG_MIPS_MT_SMP */ WARN_ON(idx < 0 || idx >= mipspmu.num_counters); @@ -336,21 +328,15 @@ static void mipsxx_pmu_enable_event(struct hw_perf_event *evt, int idx) /* Make sure interrupt enabled. */ MIPS_PERFCTRL_IE; -#ifdef CONFIG_CPU_BMIPS5000 - { + if (IS_ENABLED(CONFIG_CPU_BMIPS5000)) { /* enable the counter for the calling thread */ cpuc->saved_ctrl[idx] |= (1 << (12 + vpe_id())) | BRCM_PERFCTRL_TC; - } -#else -#ifdef CONFIG_MIPS_MT_SMP - if (range > V) { + } else if (IS_ENABLED(CONFIG_MIPS_MT_SMP) && range > V) { /* The counter is processor wide. Set it up to count all TCs. 
*/ pr_debug("Enabling perf counter for all TCs\n"); cpuc->saved_ctrl[idx] |= M_TC_EN_ALL; - } else -#endif /* CONFIG_MIPS_MT_SMP */ - { + } else { unsigned int cpu, ctrl; /* @@ -365,7 +351,6 @@ static void mipsxx_pmu_enable_event(struct hw_perf_event *evt, int idx) cpuc->saved_ctrl[idx] |= ctrl; pr_debug("Enabling perf counter for CPU%d\n", cpu); } -#endif /* CONFIG_CPU_BMIPS5000 */ /* * We do not actually let the counter run. Leave it until start(). */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 9392dfe33f97..0e2dd68ade57 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -366,3 +366,9 @@ 425 n32 io_uring_setup sys_io_uring_setup 426 n32 io_uring_enter sys_io_uring_enter 427 n32 io_uring_register sys_io_uring_register +428 n32 open_tree sys_open_tree +429 n32 move_mount sys_move_mount +430 n32 fsopen sys_fsopen +431 n32 fsconfig sys_fsconfig +432 n32 fsmount sys_fsmount +433 n32 fspick sys_fspick diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index cd0c8aa21fba..5eebfa0d155c 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -342,3 +342,9 @@ 425 n64 io_uring_setup sys_io_uring_setup 426 n64 io_uring_enter sys_io_uring_enter 427 n64 io_uring_register sys_io_uring_register +428 n64 open_tree sys_open_tree +429 n64 move_mount sys_move_mount +430 n64 fsopen sys_fsopen +431 n64 fsconfig sys_fsconfig +432 n64 fsmount sys_fsmount +433 n64 fspick sys_fspick diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index e849e8ffe4a2..3cc1374e02d0 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -415,3 +415,9 @@ 425 o32 io_uring_setup sys_io_uring_setup 426 o32 io_uring_enter sys_io_uring_enter 427 o32 io_uring_register sys_io_uring_register +428 o32 
open_tree sys_open_tree +429 o32 move_mount sys_move_mount +430 o32 fsopen sys_fsopen +431 o32 fsconfig sys_fsconfig +432 o32 fsmount sys_fsmount +433 o32 fspick sys_fspick diff --git a/arch/mips/pci/Makefile b/arch/mips/pci/Makefile index c4f976593061..d6de4cb2e31c 100644 --- a/arch/mips/pci/Makefile +++ b/arch/mips/pci/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_PCI_AR2315) += pci-ar2315.o obj-$(CONFIG_SOC_AR71XX) += pci-ar71xx.o obj-$(CONFIG_PCI_AR724X) += pci-ar724x.o obj-$(CONFIG_MIPS_PCI_VIRTIO) += pci-virtio-guest.o +obj-$(CONFIG_PCI_XTALK_BRIDGE) += pci-xtalk-bridge.o # # These are still pretty much in the old state, watch, go blind. # @@ -39,7 +40,7 @@ obj-$(CONFIG_MIPS_MALTA) += fixup-malta.o pci-malta.o obj-$(CONFIG_PMC_MSP7120_GW) += fixup-pmcmsp.o ops-pmcmsp.o obj-$(CONFIG_PMC_MSP7120_EVAL) += fixup-pmcmsp.o ops-pmcmsp.o obj-$(CONFIG_PMC_MSP7120_FPGA) += fixup-pmcmsp.o ops-pmcmsp.o -obj-$(CONFIG_SGI_IP27) += ops-bridge.o pci-ip27.o +obj-$(CONFIG_SGI_IP27) += pci-ip27.o obj-$(CONFIG_SGI_IP32) += fixup-ip32.o ops-mace.o pci-ip32.o obj-$(CONFIG_SIBYTE_SB1250) += fixup-sb1250.o pci-sb1250.o obj-$(CONFIG_SIBYTE_BCM112X) += fixup-sb1250.o pci-sb1250.o diff --git a/arch/mips/pci/ops-bridge.c b/arch/mips/pci/ops-bridge.c deleted file mode 100644 index df95b0da08f2..000000000000 --- a/arch/mips/pci/ops-bridge.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1999, 2000, 04, 06 Ralf Baechle (ralf@linux-mips.org) - * Copyright (C) 1999, 2000 Silicon Graphics, Inc. 
- */ -#include <linux/pci.h> -#include <asm/paccess.h> -#include <asm/pci/bridge.h> -#include <asm/sn/arch.h> -#include <asm/sn/intr.h> -#include <asm/sn/sn0/hub.h> - -/* - * Most of the IOC3 PCI config register aren't present - * we emulate what is needed for a normal PCI enumeration - */ -static u32 emulate_ioc3_cfg(int where, int size) -{ - if (size == 1 && where == 0x3d) - return 0x01; - else if (size == 2 && where == 0x3c) - return 0x0100; - else if (size == 4 && where == 0x3c) - return 0x00000100; - - return 0; -} - -/* - * The Bridge ASIC supports both type 0 and type 1 access. Type 1 is - * not really documented, so right now I can't write code which uses it. - * Therefore we use type 0 accesses for now even though they won't work - * correctly for PCI-to-PCI bridges. - * - * The function is complicated by the ultimate brokenness of the IOC3 chip - * which is used in SGI systems. The IOC3 can only handle 32-bit PCI - * accesses and does only decode parts of it's address space. - */ - -static int pci_conf0_read_config(struct pci_bus *bus, unsigned int devfn, - int where, int size, u32 * value) -{ - struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); - struct bridge_regs *bridge = bc->base; - int slot = PCI_SLOT(devfn); - int fn = PCI_FUNC(devfn); - volatile void *addr; - u32 cf, shift, mask; - int res; - - addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[PCI_VENDOR_ID]; - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - /* - * IOC3 is broken beyond belief ... Don't even give the - * generic PCI code a chance to look at it for real ... - */ - if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) - goto is_ioc3; - - addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[where ^ (4 - size)]; - - if (size == 1) - res = get_dbe(*value, (u8 *) addr); - else if (size == 2) - res = get_dbe(*value, (u16 *) addr); - else - res = get_dbe(*value, (u32 *) addr); - - return res ? 
PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL; - -is_ioc3: - - /* - * IOC3 special handling - */ - if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) { - *value = emulate_ioc3_cfg(where, size); - return PCIBIOS_SUCCESSFUL; - } - - addr = &bridge->b_type0_cfg_dev[slot].f[fn].l[where >> 2]; - - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - shift = ((where & 3) << 3); - mask = (0xffffffffU >> ((4 - size) << 3)); - *value = (cf >> shift) & mask; - - return PCIBIOS_SUCCESSFUL; -} - -static int pci_conf1_read_config(struct pci_bus *bus, unsigned int devfn, - int where, int size, u32 * value) -{ - struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); - struct bridge_regs *bridge = bc->base; - int busno = bus->number; - int slot = PCI_SLOT(devfn); - int fn = PCI_FUNC(devfn); - volatile void *addr; - u32 cf, shift, mask; - int res; - - bridge_write(bc, b_pci_cfg, (busno << 16) | (slot << 11)); - addr = &bridge->b_type1_cfg.c[(fn << 8) | PCI_VENDOR_ID]; - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - /* - * IOC3 is broken beyond belief ... Don't even give the - * generic PCI code a chance to look at it for real ... - */ - if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) - goto is_ioc3; - - bridge_write(bc, b_pci_cfg, (busno << 16) | (slot << 11)); - addr = &bridge->b_type1_cfg.c[(fn << 8) | (where ^ (4 - size))]; - - if (size == 1) - res = get_dbe(*value, (u8 *) addr); - else if (size == 2) - res = get_dbe(*value, (u16 *) addr); - else - res = get_dbe(*value, (u32 *) addr); - - return res ? 
PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL; - -is_ioc3: - - /* - * IOC3 special handling - */ - if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) { - *value = emulate_ioc3_cfg(where, size); - return PCIBIOS_SUCCESSFUL; - } - - bridge_write(bc, b_pci_cfg, (busno << 16) | (slot << 11)); - addr = &bridge->b_type1_cfg.c[(fn << 8) | where]; - - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - shift = ((where & 3) << 3); - mask = (0xffffffffU >> ((4 - size) << 3)); - *value = (cf >> shift) & mask; - - return PCIBIOS_SUCCESSFUL; -} - -static int pci_read_config(struct pci_bus *bus, unsigned int devfn, - int where, int size, u32 * value) -{ - if (!pci_is_root_bus(bus)) - return pci_conf1_read_config(bus, devfn, where, size, value); - - return pci_conf0_read_config(bus, devfn, where, size, value); -} - -static int pci_conf0_write_config(struct pci_bus *bus, unsigned int devfn, - int where, int size, u32 value) -{ - struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); - struct bridge_regs *bridge = bc->base; - int slot = PCI_SLOT(devfn); - int fn = PCI_FUNC(devfn); - volatile void *addr; - u32 cf, shift, mask, smask; - int res; - - addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[PCI_VENDOR_ID]; - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - /* - * IOC3 is broken beyond belief ... Don't even give the - * generic PCI code a chance to look at it for real ... 
- */ - if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) - goto is_ioc3; - - addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[where ^ (4 - size)]; - - if (size == 1) { - res = put_dbe(value, (u8 *) addr); - } else if (size == 2) { - res = put_dbe(value, (u16 *) addr); - } else { - res = put_dbe(value, (u32 *) addr); - } - - if (res) - return PCIBIOS_DEVICE_NOT_FOUND; - - return PCIBIOS_SUCCESSFUL; - -is_ioc3: - - /* - * IOC3 special handling - */ - if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) - return PCIBIOS_SUCCESSFUL; - - addr = &bridge->b_type0_cfg_dev[slot].f[fn].l[where >> 2]; - - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - shift = ((where & 3) << 3); - mask = (0xffffffffU >> ((4 - size) << 3)); - smask = mask << shift; - - cf = (cf & ~smask) | ((value & mask) << shift); - if (put_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - return PCIBIOS_SUCCESSFUL; -} - -static int pci_conf1_write_config(struct pci_bus *bus, unsigned int devfn, - int where, int size, u32 value) -{ - struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); - struct bridge_regs *bridge = bc->base; - int slot = PCI_SLOT(devfn); - int fn = PCI_FUNC(devfn); - int busno = bus->number; - volatile void *addr; - u32 cf, shift, mask, smask; - int res; - - bridge_write(bc, b_pci_cfg, (busno << 16) | (slot << 11)); - addr = &bridge->b_type1_cfg.c[(fn << 8) | PCI_VENDOR_ID]; - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - /* - * IOC3 is broken beyond belief ... Don't even give the - * generic PCI code a chance to look at it for real ... 
- */ - if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) - goto is_ioc3; - - addr = &bridge->b_type1_cfg.c[(fn << 8) | (where ^ (4 - size))]; - - if (size == 1) { - res = put_dbe(value, (u8 *) addr); - } else if (size == 2) { - res = put_dbe(value, (u16 *) addr); - } else { - res = put_dbe(value, (u32 *) addr); - } - - if (res) - return PCIBIOS_DEVICE_NOT_FOUND; - - return PCIBIOS_SUCCESSFUL; - -is_ioc3: - - /* - * IOC3 special handling - */ - if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) - return PCIBIOS_SUCCESSFUL; - - addr = &bridge->b_type0_cfg_dev[slot].f[fn].l[where >> 2]; - - if (get_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - shift = ((where & 3) << 3); - mask = (0xffffffffU >> ((4 - size) << 3)); - smask = mask << shift; - - cf = (cf & ~smask) | ((value & mask) << shift); - if (put_dbe(cf, (u32 *) addr)) - return PCIBIOS_DEVICE_NOT_FOUND; - - return PCIBIOS_SUCCESSFUL; -} - -static int pci_write_config(struct pci_bus *bus, unsigned int devfn, - int where, int size, u32 value) -{ - if (!pci_is_root_bus(bus)) - return pci_conf1_write_config(bus, devfn, where, size, value); - - return pci_conf0_write_config(bus, devfn, where, size, value); -} - -struct pci_ops bridge_pci_ops = { - .read = pci_read_config, - .write = pci_write_config, -}; diff --git a/arch/mips/pci/pci-ip27.c b/arch/mips/pci/pci-ip27.c index 3c177b4d0609..441eb9383b20 100644 --- a/arch/mips/pci/pci-ip27.c +++ b/arch/mips/pci/pci-ip27.c @@ -7,162 +7,7 @@ * Copyright (C) 1999, 2000, 04 Ralf Baechle (ralf@linux-mips.org) * Copyright (C) 1999, 2000 Silicon Graphics, Inc. */ -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <linux/smp.h> -#include <linux/dma-direct.h> -#include <asm/sn/arch.h> #include <asm/pci/bridge.h> -#include <asm/paccess.h> -#include <asm/sn/intr.h> -#include <asm/sn/sn0/hub.h> - -/* - * Max #PCI busses we can handle; ie, max #PCI bridges. 
- */ -#define MAX_PCI_BUSSES 40 - -/* - * XXX: No kmalloc available when we do our crosstalk scan, - * we should try to move it later in the boot process. - */ -static struct bridge_controller bridges[MAX_PCI_BUSSES]; - -extern struct pci_ops bridge_pci_ops; - -int bridge_probe(nasid_t nasid, int widget_id, int masterwid) -{ - unsigned long offset = NODE_OFFSET(nasid); - struct bridge_controller *bc; - static int num_bridges = 0; - int slot; - - pci_set_flags(PCI_PROBE_ONLY); - - printk("a bridge\n"); - - /* XXX: kludge alert.. */ - if (!num_bridges) - ioport_resource.end = ~0UL; - - bc = &bridges[num_bridges]; - - bc->pc.pci_ops = &bridge_pci_ops; - bc->pc.mem_resource = &bc->mem; - bc->pc.io_resource = &bc->io; - - bc->pc.index = num_bridges; - - bc->mem.name = "Bridge PCI MEM"; - bc->pc.mem_offset = offset; - bc->mem.start = 0; - bc->mem.end = ~0UL; - bc->mem.flags = IORESOURCE_MEM; - - bc->io.name = "Bridge IO MEM"; - bc->pc.io_offset = offset; - bc->io.start = 0UL; - bc->io.end = ~0UL; - bc->io.flags = IORESOURCE_IO; - - bc->widget_id = widget_id; - bc->nasid = nasid; - - bc->baddr = (u64)masterwid << 60 | PCI64_ATTR_BAR; - - /* - * point to this bridge - */ - bc->base = (struct bridge_regs *)RAW_NODE_SWIN_BASE(nasid, widget_id); - - /* - * Clear all pending interrupts. - */ - bridge_write(bc, b_int_rst_stat, BRIDGE_IRR_ALL_CLR); - - /* - * Until otherwise set up, assume all interrupts are from slot 0 - */ - bridge_write(bc, b_int_device, 0x0); - - /* - * swap pio's to pci mem and io space (big windows) - */ - bridge_set(bc, b_wid_control, BRIDGE_CTRL_IO_SWAP | - BRIDGE_CTRL_MEM_SWAP); -#ifdef CONFIG_PAGE_SIZE_4KB - bridge_clr(bc, b_wid_control, BRIDGE_CTRL_PAGE_SIZE); -#else /* 16kB or larger */ - bridge_set(bc, b_wid_control, BRIDGE_CTRL_PAGE_SIZE); -#endif - - /* - * Hmm... IRIX sets additional bits in the address which - * are documented as reserved in the bridge docs. 
- */ - bridge_write(bc, b_wid_int_upper, 0x8000 | (masterwid << 16)); - bridge_write(bc, b_wid_int_lower, 0x01800090); /* PI_INT_PEND_MOD off*/ - bridge_write(bc, b_dir_map, (masterwid << 20)); /* DMA */ - bridge_write(bc, b_int_enable, 0); - - for (slot = 0; slot < 8; slot ++) { - bridge_set(bc, b_device[slot].reg, BRIDGE_DEV_SWAP_DIR); - bc->pci_int[slot] = -1; - } - bridge_read(bc, b_wid_tflush); /* wait until Bridge PIO complete */ - - register_pci_controller(&bc->pc); - - num_bridges++; - - return 0; -} - -/* - * All observed requests have pin == 1. We could have a global here, that - * gets incremented and returned every time - unfortunately, pci_map_irq - * may be called on the same device over and over, and need to return the - * same value. On O2000, pin can be 0 or 1, and PCI slots can be [0..7]. - * - * A given PCI device, in general, should be able to intr any of the cpus - * on any one of the hubs connected to its xbow. - */ -int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) -{ - return 0; -} - -static inline struct pci_dev *bridge_root_dev(struct pci_dev *dev) -{ - while (dev->bus->parent) { - /* Move up the chain of bridges. */ - dev = dev->bus->self; - } - - return dev; -} - -/* Do platform specific device initialization at pci_enable_device() time */ -int pcibios_plat_dev_init(struct pci_dev *dev) -{ - struct bridge_controller *bc = BRIDGE_CONTROLLER(dev->bus); - struct pci_dev *rdev = bridge_root_dev(dev); - int slot = PCI_SLOT(rdev->devfn); - int irq; - - irq = bc->pci_int[slot]; - if (irq == -1) { - irq = request_bridge_irq(bc, slot); - if (irq < 0) - return irq; - - bc->pci_int[slot] = irq; - } - dev->irq = irq; - - return 0; -} dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { @@ -177,29 +22,6 @@ phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) return dma_addr & ~(0xffUL << 56); } -/* - * Device might live on a subordinate PCI bus. 
XXX Walk up the chain of buses - * to find the slot number in sense of the bridge device register. - * XXX This also means multiple devices might rely on conflicting bridge - * settings. - */ - -static inline void pci_disable_swapping(struct pci_dev *dev) -{ - struct bridge_controller *bc = BRIDGE_CONTROLLER(dev->bus); - struct bridge_regs *bridge = bc->base; - int slot = PCI_SLOT(dev->devfn); - - /* Turn off byte swapping */ - bridge->b_device[slot].reg &= ~BRIDGE_DEV_SWAP_DIR; - bridge->b_widget.w_tflush; /* Flush */ -} - -static void pci_fixup_ioc3(struct pci_dev *d) -{ - pci_disable_swapping(d); -} - #ifdef CONFIG_NUMA int pcibus_to_node(struct pci_bus *bus) { @@ -209,6 +31,3 @@ int pcibus_to_node(struct pci_bus *bus) } EXPORT_SYMBOL(pcibus_to_node); #endif /* CONFIG_NUMA */ - -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SGI, PCI_DEVICE_ID_SGI_IOC3, - pci_fixup_ioc3); diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c new file mode 100644 index 000000000000..bcf7f559789a --- /dev/null +++ b/arch/mips/pci/pci-xtalk-bridge.c @@ -0,0 +1,610 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2003 Christoph Hellwig (hch@lst.de) + * Copyright (C) 1999, 2000, 04 Ralf Baechle (ralf@linux-mips.org) + * Copyright (C) 1999, 2000 Silicon Graphics, Inc. 
+ */ +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/pci.h> +#include <linux/smp.h> +#include <linux/dma-direct.h> +#include <linux/platform_device.h> +#include <linux/platform_data/xtalk-bridge.h> + +#include <asm/pci/bridge.h> +#include <asm/paccess.h> +#include <asm/sn/irq_alloc.h> + +/* + * Most of the IOC3 PCI config register aren't present + * we emulate what is needed for a normal PCI enumeration + */ +static u32 emulate_ioc3_cfg(int where, int size) +{ + if (size == 1 && where == 0x3d) + return 0x01; + else if (size == 2 && where == 0x3c) + return 0x0100; + else if (size == 4 && where == 0x3c) + return 0x00000100; + + return 0; +} + +static void bridge_disable_swapping(struct pci_dev *dev) +{ + struct bridge_controller *bc = BRIDGE_CONTROLLER(dev->bus); + int slot = PCI_SLOT(dev->devfn); + + /* Turn off byte swapping */ + bridge_clr(bc, b_device[slot].reg, BRIDGE_DEV_SWAP_DIR); + bridge_read(bc, b_widget.w_tflush); /* Flush */ +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SGI, PCI_DEVICE_ID_SGI_IOC3, + bridge_disable_swapping); + + +/* + * The Bridge ASIC supports both type 0 and type 1 access. Type 1 is + * not really documented, so right now I can't write code which uses it. + * Therefore we use type 0 accesses for now even though they won't work + * correctly for PCI-to-PCI bridges. + * + * The function is complicated by the ultimate brokenness of the IOC3 chip + * which is used in SGI systems. The IOC3 can only handle 32-bit PCI + * accesses and does only decode parts of it's address space. 
+ */ +static int pci_conf0_read_config(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *value) +{ + struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct bridge_regs *bridge = bc->base; + int slot = PCI_SLOT(devfn); + int fn = PCI_FUNC(devfn); + void *addr; + u32 cf, shift, mask; + int res; + + addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[PCI_VENDOR_ID]; + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + /* + * IOC3 is broken beyond belief ... Don't even give the + * generic PCI code a chance to look at it for real ... + */ + if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) + goto is_ioc3; + + addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[where ^ (4 - size)]; + + if (size == 1) + res = get_dbe(*value, (u8 *)addr); + else if (size == 2) + res = get_dbe(*value, (u16 *)addr); + else + res = get_dbe(*value, (u32 *)addr); + + return res ? PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL; + +is_ioc3: + + /* + * IOC3 special handling + */ + if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) { + *value = emulate_ioc3_cfg(where, size); + return PCIBIOS_SUCCESSFUL; + } + + addr = &bridge->b_type0_cfg_dev[slot].f[fn].l[where >> 2]; + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + shift = ((where & 3) << 3); + mask = (0xffffffffU >> ((4 - size) << 3)); + *value = (cf >> shift) & mask; + + return PCIBIOS_SUCCESSFUL; +} + +static int pci_conf1_read_config(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *value) +{ + struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct bridge_regs *bridge = bc->base; + int busno = bus->number; + int slot = PCI_SLOT(devfn); + int fn = PCI_FUNC(devfn); + void *addr; + u32 cf, shift, mask; + int res; + + bridge_write(bc, b_pci_cfg, (busno << 16) | (slot << 11)); + addr = &bridge->b_type1_cfg.c[(fn << 8) | PCI_VENDOR_ID]; + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + /* + * IOC3 is broken beyond belief ... 
Don't even give the + * generic PCI code a chance to look at it for real ... + */ + if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) + goto is_ioc3; + + addr = &bridge->b_type1_cfg.c[(fn << 8) | (where ^ (4 - size))]; + + if (size == 1) + res = get_dbe(*value, (u8 *)addr); + else if (size == 2) + res = get_dbe(*value, (u16 *)addr); + else + res = get_dbe(*value, (u32 *)addr); + + return res ? PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL; + +is_ioc3: + + /* + * IOC3 special handling + */ + if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) { + *value = emulate_ioc3_cfg(where, size); + return PCIBIOS_SUCCESSFUL; + } + + addr = &bridge->b_type1_cfg.c[(fn << 8) | where]; + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + shift = ((where & 3) << 3); + mask = (0xffffffffU >> ((4 - size) << 3)); + *value = (cf >> shift) & mask; + + return PCIBIOS_SUCCESSFUL; +} + +static int pci_read_config(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *value) +{ + if (!pci_is_root_bus(bus)) + return pci_conf1_read_config(bus, devfn, where, size, value); + + return pci_conf0_read_config(bus, devfn, where, size, value); +} + +static int pci_conf0_write_config(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 value) +{ + struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct bridge_regs *bridge = bc->base; + int slot = PCI_SLOT(devfn); + int fn = PCI_FUNC(devfn); + void *addr; + u32 cf, shift, mask, smask; + int res; + + addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[PCI_VENDOR_ID]; + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + /* + * IOC3 is broken beyond belief ... Don't even give the + * generic PCI code a chance to look at it for real ... 
+ */ + if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) + goto is_ioc3; + + addr = &bridge->b_type0_cfg_dev[slot].f[fn].c[where ^ (4 - size)]; + + if (size == 1) + res = put_dbe(value, (u8 *)addr); + else if (size == 2) + res = put_dbe(value, (u16 *)addr); + else + res = put_dbe(value, (u32 *)addr); + + if (res) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; + +is_ioc3: + + /* + * IOC3 special handling + */ + if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) + return PCIBIOS_SUCCESSFUL; + + addr = &bridge->b_type0_cfg_dev[slot].f[fn].l[where >> 2]; + + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + shift = ((where & 3) << 3); + mask = (0xffffffffU >> ((4 - size) << 3)); + smask = mask << shift; + + cf = (cf & ~smask) | ((value & mask) << shift); + if (put_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int pci_conf1_write_config(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 value) +{ + struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct bridge_regs *bridge = bc->base; + int slot = PCI_SLOT(devfn); + int fn = PCI_FUNC(devfn); + int busno = bus->number; + void *addr; + u32 cf, shift, mask, smask; + int res; + + bridge_write(bc, b_pci_cfg, (busno << 16) | (slot << 11)); + addr = &bridge->b_type1_cfg.c[(fn << 8) | PCI_VENDOR_ID]; + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + /* + * IOC3 is broken beyond belief ... Don't even give the + * generic PCI code a chance to look at it for real ... 
+ */ + if (cf == (PCI_VENDOR_ID_SGI | (PCI_DEVICE_ID_SGI_IOC3 << 16))) + goto is_ioc3; + + addr = &bridge->b_type1_cfg.c[(fn << 8) | (where ^ (4 - size))]; + + if (size == 1) + res = put_dbe(value, (u8 *)addr); + else if (size == 2) + res = put_dbe(value, (u16 *)addr); + else + res = put_dbe(value, (u32 *)addr); + + if (res) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; + +is_ioc3: + + /* + * IOC3 special handling + */ + if ((where >= 0x14 && where < 0x40) || (where >= 0x48)) + return PCIBIOS_SUCCESSFUL; + + addr = &bridge->b_type0_cfg_dev[slot].f[fn].l[where >> 2]; + if (get_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + shift = ((where & 3) << 3); + mask = (0xffffffffU >> ((4 - size) << 3)); + smask = mask << shift; + + cf = (cf & ~smask) | ((value & mask) << shift); + if (put_dbe(cf, (u32 *)addr)) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int pci_write_config(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 value) +{ + if (!pci_is_root_bus(bus)) + return pci_conf1_write_config(bus, devfn, where, size, value); + + return pci_conf0_write_config(bus, devfn, where, size, value); +} + +static struct pci_ops bridge_pci_ops = { + .read = pci_read_config, + .write = pci_write_config, +}; + +struct bridge_irq_chip_data { + struct bridge_controller *bc; + nasid_t nasid; +}; + +static int bridge_set_affinity(struct irq_data *d, const struct cpumask *mask, + bool force) +{ +#ifdef CONFIG_NUMA + struct bridge_irq_chip_data *data = d->chip_data; + int bit = d->parent_data->hwirq; + int pin = d->hwirq; + nasid_t nasid; + int ret, cpu; + + ret = irq_chip_set_affinity_parent(d, mask, force); + if (ret >= 0) { + cpu = cpumask_first_and(mask, cpu_online_mask); + nasid = COMPACT_TO_NASID_NODEID(cpu_to_node(cpu)); + bridge_write(data->bc, b_int_addr[pin].addr, + (((data->bc->intr_addr >> 30) & 0x30000) | + bit | (nasid << 8))); + bridge_read(data->bc, b_wid_tflush); + } + return ret; +#else + 
return irq_chip_set_affinity_parent(d, mask, force); +#endif +} + +struct irq_chip bridge_irq_chip = { + .name = "BRIDGE", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_set_affinity = bridge_set_affinity +}; + +static int bridge_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct bridge_irq_chip_data *data; + struct irq_alloc_info *info = arg; + int ret; + + if (nr_irqs > 1 || !info) + return -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + data->bc = info->ctrl; + data->nasid = info->nasid; + irq_domain_set_info(domain, virq, info->pin, &bridge_irq_chip, + data, handle_level_irq, NULL, NULL); + } else { + kfree(data); + } + + return ret; +} + +static void bridge_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irqd = irq_domain_get_irq_data(domain, virq); + + if (nr_irqs) + return; + + kfree(irqd->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static int bridge_domain_activate(struct irq_domain *domain, + struct irq_data *irqd, bool reserve) +{ + struct bridge_irq_chip_data *data = irqd->chip_data; + struct bridge_controller *bc = data->bc; + int bit = irqd->parent_data->hwirq; + int pin = irqd->hwirq; + u32 device; + + bridge_write(bc, b_int_addr[pin].addr, + (((bc->intr_addr >> 30) & 0x30000) | + bit | (data->nasid << 8))); + bridge_set(bc, b_int_enable, (1 << pin)); + bridge_set(bc, b_int_enable, 0x7ffffe00); /* more stuff in int_enable */ + + /* + * Enable sending of an interrupt clear packt to the hub on a high to + * low transition of the interrupt pin. + * + * IRIX sets additional bits in the address which are documented as + * reserved in the bridge docs. 
+ */ + bridge_set(bc, b_int_mode, (1UL << pin)); + + /* + * We assume the bridge to have a 1:1 mapping between devices + * (slots) and intr pins. + */ + device = bridge_read(bc, b_int_device); + device &= ~(7 << (pin*3)); + device |= (pin << (pin*3)); + bridge_write(bc, b_int_device, device); + + bridge_read(bc, b_wid_tflush); + return 0; +} + +static void bridge_domain_deactivate(struct irq_domain *domain, + struct irq_data *irqd) +{ + struct bridge_irq_chip_data *data = irqd->chip_data; + + bridge_clr(data->bc, b_int_enable, (1 << irqd->hwirq)); + bridge_read(data->bc, b_wid_tflush); +} + +static const struct irq_domain_ops bridge_domain_ops = { + .alloc = bridge_domain_alloc, + .free = bridge_domain_free, + .activate = bridge_domain_activate, + .deactivate = bridge_domain_deactivate +}; + +/* + * All observed requests have pin == 1. We could have a global here, that + * gets incremented and returned every time - unfortunately, pci_map_irq + * may be called on the same device over and over, and need to return the + * same value. On O2000, pin can be 0 or 1, and PCI slots can be [0..7]. + * + * A given PCI device, in general, should be able to intr any of the cpus + * on any one of the hubs connected to its xbow. 
+ */ +static int bridge_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +{ + struct bridge_controller *bc = BRIDGE_CONTROLLER(dev->bus); + struct irq_alloc_info info; + int irq; + + irq = bc->pci_int[slot]; + if (irq == -1) { + info.ctrl = bc; + info.nasid = bc->nasid; + info.pin = slot; + + irq = irq_domain_alloc_irqs(bc->domain, 1, bc->nasid, &info); + if (irq < 0) + return irq; + + bc->pci_int[slot] = irq; + } + return irq; +} + +static int bridge_probe(struct platform_device *pdev) +{ + struct xtalk_bridge_platform_data *bd = dev_get_platdata(&pdev->dev); + struct device *dev = &pdev->dev; + struct bridge_controller *bc; + struct pci_host_bridge *host; + struct irq_domain *domain, *parent; + struct fwnode_handle *fn; + int slot; + int err; + + parent = irq_get_default_host(); + if (!parent) + return -ENODEV; + fn = irq_domain_alloc_named_fwnode("BRIDGE"); + if (!fn) + return -ENOMEM; + domain = irq_domain_create_hierarchy(parent, 0, 8, fn, + &bridge_domain_ops, NULL); + irq_domain_free_fwnode(fn); + if (!domain) + return -ENOMEM; + + pci_set_flags(PCI_PROBE_ONLY); + + host = devm_pci_alloc_host_bridge(dev, sizeof(*bc)); + if (!host) { + err = -ENOMEM; + goto err_remove_domain; + } + + bc = pci_host_bridge_priv(host); + + bc->busn.name = "Bridge PCI busn"; + bc->busn.start = 0; + bc->busn.end = 0xff; + bc->busn.flags = IORESOURCE_BUS; + + bc->domain = domain; + + pci_add_resource_offset(&host->windows, &bd->mem, bd->mem_offset); + pci_add_resource_offset(&host->windows, &bd->io, bd->io_offset); + pci_add_resource(&host->windows, &bc->busn); + + err = devm_request_pci_bus_resources(dev, &host->windows); + if (err < 0) + goto err_free_resource; + + bc->nasid = bd->nasid; + + bc->baddr = (u64)bd->masterwid << 60 | PCI64_ATTR_BAR; + bc->base = (struct bridge_regs *)bd->bridge_addr; + bc->intr_addr = bd->intr_addr; + + /* + * Clear all pending interrupts. 
+ */ + bridge_write(bc, b_int_rst_stat, BRIDGE_IRR_ALL_CLR); + + /* + * Until otherwise set up, assume all interrupts are from slot 0 + */ + bridge_write(bc, b_int_device, 0x0); + + /* + * disable swapping for big windows + */ + bridge_clr(bc, b_wid_control, + BRIDGE_CTRL_IO_SWAP | BRIDGE_CTRL_MEM_SWAP); +#ifdef CONFIG_PAGE_SIZE_4KB + bridge_clr(bc, b_wid_control, BRIDGE_CTRL_PAGE_SIZE); +#else /* 16kB or larger */ + bridge_set(bc, b_wid_control, BRIDGE_CTRL_PAGE_SIZE); +#endif + + /* + * Hmm... IRIX sets additional bits in the address which + * are documented as reserved in the bridge docs. + */ + bridge_write(bc, b_wid_int_upper, + ((bc->intr_addr >> 32) & 0xffff) | (bd->masterwid << 16)); + bridge_write(bc, b_wid_int_lower, bc->intr_addr & 0xffffffff); + bridge_write(bc, b_dir_map, (bd->masterwid << 20)); /* DMA */ + bridge_write(bc, b_int_enable, 0); + + for (slot = 0; slot < 8; slot++) { + bridge_set(bc, b_device[slot].reg, BRIDGE_DEV_SWAP_DIR); + bc->pci_int[slot] = -1; + } + bridge_read(bc, b_wid_tflush); /* wait until Bridge PIO complete */ + + host->dev.parent = dev; + host->sysdata = bc; + host->busnr = 0; + host->ops = &bridge_pci_ops; + host->map_irq = bridge_map_irq; + host->swizzle_irq = pci_common_swizzle; + + err = pci_scan_root_bus_bridge(host); + if (err < 0) + goto err_free_resource; + + pci_bus_claim_resources(host->bus); + pci_bus_add_devices(host->bus); + + platform_set_drvdata(pdev, host->bus); + + return 0; + +err_free_resource: + pci_free_resource_list(&host->windows); +err_remove_domain: + irq_domain_remove(domain); + return err; +} + +static int bridge_remove(struct platform_device *pdev) +{ + struct pci_bus *bus = platform_get_drvdata(pdev); + struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + + irq_domain_remove(bc->domain); + pci_lock_rescan_remove(); + pci_stop_root_bus(bus); + pci_remove_root_bus(bus); + pci_unlock_rescan_remove(); + + return 0; +} + +static struct platform_driver bridge_driver = { + .probe = bridge_probe, + 
.remove = bridge_remove, + .driver = { + .name = "xtalk-bridge", + } +}; + +builtin_platform_driver(bridge_driver); diff --git a/arch/mips/sgi-ip22/ip22-platform.c b/arch/mips/sgi-ip22/ip22-platform.c index 37ad26716579..0b2002e02a47 100644 --- a/arch/mips/sgi-ip22/ip22-platform.c +++ b/arch/mips/sgi-ip22/ip22-platform.c @@ -3,6 +3,7 @@ #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/platform_device.h> +#include <linux/dma-mapping.h> #include <asm/paccess.h> #include <asm/sgi/ip22.h> @@ -25,6 +26,8 @@ static struct sgiwd93_platform_data sgiwd93_0_pd = { .irq = SGI_WD93_0_IRQ, }; +static u64 sgiwd93_0_dma_mask = DMA_BIT_MASK(32); + static struct platform_device sgiwd93_0_device = { .name = "sgiwd93", .id = 0, @@ -32,6 +35,8 @@ static struct platform_device sgiwd93_0_device = { .resource = sgiwd93_0_resources, .dev = { .platform_data = &sgiwd93_0_pd, + .dma_mask = &sgiwd93_0_dma_mask, + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; @@ -49,6 +54,8 @@ static struct sgiwd93_platform_data sgiwd93_1_pd = { .irq = SGI_WD93_1_IRQ, }; +static u64 sgiwd93_1_dma_mask = DMA_BIT_MASK(32); + static struct platform_device sgiwd93_1_device = { .name = "sgiwd93", .id = 1, @@ -56,6 +63,8 @@ static struct platform_device sgiwd93_1_device = { .resource = sgiwd93_1_resources, .dev = { .platform_data = &sgiwd93_1_pd, + .dma_mask = &sgiwd93_1_dma_mask, + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; @@ -96,6 +105,8 @@ static struct resource sgiseeq_0_resources[] = { static struct sgiseeq_platform_data eth0_pd; +static u64 sgiseeq_dma_mask = DMA_BIT_MASK(32); + static struct platform_device eth0_device = { .name = "sgiseeq", .id = 0, @@ -103,6 +114,8 @@ static struct platform_device eth0_device = { .resource = sgiseeq_0_resources, .dev = { .platform_data = ð0_pd, + .dma_mask = &sgiseeq_dma_mask, + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; diff --git a/arch/mips/sgi-ip27/ip27-init.c b/arch/mips/sgi-ip27/ip27-init.c index 6074efeff894..066b33f50bcc 100644 --- 
a/arch/mips/sgi-ip27/ip27-init.c +++ b/arch/mips/sgi-ip27/ip27-init.c @@ -184,5 +184,7 @@ void __init plat_mem_setup(void) ioc3_eth_init(); + ioport_resource.start = 0; + ioport_resource.end = ~0UL; set_io_port_base(IO_BASE); } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index a32f843cdbe0..37be04975831 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -12,22 +12,20 @@ #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/bitops.h> +#include <linux/sched.h> #include <asm/io.h> #include <asm/irq_cpu.h> -#include <asm/pci/bridge.h> #include <asm/sn/addrs.h> #include <asm/sn/agent.h> #include <asm/sn/arch.h> #include <asm/sn/hub.h> #include <asm/sn/intr.h> +#include <asm/sn/irq_alloc.h> struct hub_irq_data { - struct bridge_controller *bc; u64 *irq_mask[2]; cpuid_t cpu; - int bit; - int pin; }; static DECLARE_BITMAP(hub_irq_map, IP27_HUB_IRQ_COUNT); @@ -54,7 +52,7 @@ static void enable_hub_irq(struct irq_data *d) struct hub_irq_data *hd = irq_data_get_irq_chip_data(d); unsigned long *mask = per_cpu(irq_enable_mask, hd->cpu); - set_bit(hd->bit, mask); + set_bit(d->hwirq, mask); __raw_writeq(mask[0], hd->irq_mask[0]); __raw_writeq(mask[1], hd->irq_mask[1]); } @@ -64,71 +62,11 @@ static void disable_hub_irq(struct irq_data *d) struct hub_irq_data *hd = irq_data_get_irq_chip_data(d); unsigned long *mask = per_cpu(irq_enable_mask, hd->cpu); - clear_bit(hd->bit, mask); + clear_bit(d->hwirq, mask); __raw_writeq(mask[0], hd->irq_mask[0]); __raw_writeq(mask[1], hd->irq_mask[1]); } -static unsigned int startup_bridge_irq(struct irq_data *d) -{ - struct hub_irq_data *hd = irq_data_get_irq_chip_data(d); - struct bridge_controller *bc; - nasid_t nasid; - u32 device; - int pin; - - if (!hd) - return -EINVAL; - - pin = hd->pin; - bc = hd->bc; - - nasid = COMPACT_TO_NASID_NODEID(cpu_to_node(hd->cpu)); - bridge_write(bc, b_int_addr[pin].addr, - (0x20000 | hd->bit | (nasid << 8))); - bridge_set(bc, 
b_int_enable, (1 << pin)); - bridge_set(bc, b_int_enable, 0x7ffffe00); /* more stuff in int_enable */ - - /* - * Enable sending of an interrupt clear packt to the hub on a high to - * low transition of the interrupt pin. - * - * IRIX sets additional bits in the address which are documented as - * reserved in the bridge docs. - */ - bridge_set(bc, b_int_mode, (1UL << pin)); - - /* - * We assume the bridge to have a 1:1 mapping between devices - * (slots) and intr pins. - */ - device = bridge_read(bc, b_int_device); - device &= ~(7 << (pin*3)); - device |= (pin << (pin*3)); - bridge_write(bc, b_int_device, device); - - bridge_read(bc, b_wid_tflush); - - enable_hub_irq(d); - - return 0; /* Never anything pending. */ -} - -static void shutdown_bridge_irq(struct irq_data *d) -{ - struct hub_irq_data *hd = irq_data_get_irq_chip_data(d); - struct bridge_controller *bc; - - if (!hd) - return; - - disable_hub_irq(d); - - bc = hd->bc; - bridge_clr(bc, b_int_enable, (1 << hd->pin)); - bridge_read(bc, b_wid_tflush); -} - static void setup_hub_mask(struct hub_irq_data *hd, const struct cpumask *mask) { nasid_t nasid; @@ -144,9 +82,6 @@ static void setup_hub_mask(struct hub_irq_data *hd, const struct cpumask *mask) hd->irq_mask[0] = REMOTE_HUB_PTR(nasid, PI_INT_MASK0_B); hd->irq_mask[1] = REMOTE_HUB_PTR(nasid, PI_INT_MASK1_B); } - - /* Make sure it's not already pending when we connect it. 
*/ - REMOTE_HUB_CLR_INTR(nasid, hd->bit); } static int set_affinity_hub_irq(struct irq_data *d, const struct cpumask *mask, @@ -163,7 +98,7 @@ static int set_affinity_hub_irq(struct irq_data *d, const struct cpumask *mask, setup_hub_mask(hd, mask); if (irqd_is_started(d)) - startup_bridge_irq(d); + enable_hub_irq(d); irq_data_update_effective_affinity(d, cpumask_of(hd->cpu)); @@ -172,20 +107,22 @@ static int set_affinity_hub_irq(struct irq_data *d, const struct cpumask *mask, static struct irq_chip hub_irq_type = { .name = "HUB", - .irq_startup = startup_bridge_irq, - .irq_shutdown = shutdown_bridge_irq, .irq_mask = disable_hub_irq, .irq_unmask = enable_hub_irq, .irq_set_affinity = set_affinity_hub_irq, }; -int request_bridge_irq(struct bridge_controller *bc, int pin) +static int hub_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { + struct irq_alloc_info *info = arg; struct hub_irq_data *hd; struct hub_data *hub; struct irq_desc *desc; int swlevel; - int irq; + + if (nr_irqs > 1 || !info) + return -EINVAL; hd = kzalloc(sizeof(*hd), GFP_KERNEL); if (!hd) @@ -196,46 +133,41 @@ int request_bridge_irq(struct bridge_controller *bc, int pin) kfree(hd); return -EAGAIN; } - irq = swlevel + IP27_HUB_IRQ_BASE; - - hd->bc = bc; - hd->bit = swlevel; - hd->pin = pin; - irq_set_chip_data(irq, hd); + irq_domain_set_info(domain, virq, swlevel, &hub_irq_type, hd, + handle_level_irq, NULL, NULL); /* use CPU connected to nearest hub */ - hub = hub_data(NASID_TO_COMPACT_NODEID(bc->nasid)); + hub = hub_data(NASID_TO_COMPACT_NODEID(info->nasid)); setup_hub_mask(hd, &hub->h_cpus); - desc = irq_to_desc(irq); - desc->irq_common_data.node = bc->nasid; + /* Make sure it's not already pending when we connect it. 
*/ + REMOTE_HUB_CLR_INTR(info->nasid, swlevel); + + desc = irq_to_desc(virq); + desc->irq_common_data.node = info->nasid; cpumask_copy(desc->irq_common_data.affinity, &hub->h_cpus); - return irq; + return 0; } -void ip27_hub_irq_init(void) +static void hub_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) { - int i; + struct irq_data *irqd; - for (i = IP27_HUB_IRQ_BASE; - i < (IP27_HUB_IRQ_BASE + IP27_HUB_IRQ_COUNT); i++) - irq_set_chip_and_handler(i, &hub_irq_type, handle_level_irq); - - /* - * Some interrupts are reserved by hardware or by software convention. - * Mark these as reserved right away so they won't be used accidentally - * later. - */ - for (i = 0; i <= BASE_PCI_IRQ; i++) - set_bit(i, hub_irq_map); - - set_bit(IP_PEND0_6_63, hub_irq_map); + if (nr_irqs > 1) + return; - for (i = NI_BRDCAST_ERR_A; i <= MSC_PANIC_INTR; i++) - set_bit(i, hub_irq_map); + irqd = irq_domain_get_irq_data(domain, virq); + if (irqd && irqd->chip_data) + kfree(irqd->chip_data); } +static const struct irq_domain_ops hub_domain_ops = { + .alloc = hub_domain_alloc, + .free = hub_domain_free, +}; + /* * This code is unnecessarily complex, because we do * intr enabling. 
Basically, once we grab the set of intrs we need @@ -252,7 +184,9 @@ static void ip27_do_irq_mask0(struct irq_desc *desc) { cpuid_t cpu = smp_processor_id(); unsigned long *mask = per_cpu(irq_enable_mask, cpu); + struct irq_domain *domain; u64 pend0; + int irq; /* copied from Irix intpend0() */ pend0 = LOCAL_HUB_L(PI_INT_PEND0); @@ -276,7 +210,14 @@ static void ip27_do_irq_mask0(struct irq_desc *desc) generic_smp_call_function_interrupt(); } else #endif - generic_handle_irq(__ffs(pend0) + IP27_HUB_IRQ_BASE); + { + domain = irq_desc_get_handler_data(desc); + irq = irq_linear_revmap(domain, __ffs(pend0)); + if (irq) + generic_handle_irq(irq); + else + spurious_interrupt(); + } LOCAL_HUB_L(PI_INT_PEND0); } @@ -285,7 +226,9 @@ static void ip27_do_irq_mask1(struct irq_desc *desc) { cpuid_t cpu = smp_processor_id(); unsigned long *mask = per_cpu(irq_enable_mask, cpu); + struct irq_domain *domain; u64 pend1; + int irq; /* copied from Irix intpend0() */ pend1 = LOCAL_HUB_L(PI_INT_PEND1); @@ -294,7 +237,12 @@ static void ip27_do_irq_mask1(struct irq_desc *desc) if (!pend1) return; - generic_handle_irq(__ffs(pend1) + IP27_HUB_IRQ_BASE + 64); + domain = irq_desc_get_handler_data(desc); + irq = irq_linear_revmap(domain, __ffs(pend1) + 64); + if (irq) + generic_handle_irq(irq); + else + spurious_interrupt(); LOCAL_HUB_L(PI_INT_PEND1); } @@ -325,11 +273,41 @@ void install_ipi(void) void __init arch_init_irq(void) { + struct irq_domain *domain; + struct fwnode_handle *fn; + int i; + mips_cpu_irq_init(); - ip27_hub_irq_init(); + + /* + * Some interrupts are reserved by hardware or by software convention. + * Mark these as reserved right away so they won't be used accidentally + * later. 
+ */ + for (i = 0; i <= BASE_PCI_IRQ; i++) + set_bit(i, hub_irq_map); + + set_bit(IP_PEND0_6_63, hub_irq_map); + + for (i = NI_BRDCAST_ERR_A; i <= MSC_PANIC_INTR; i++) + set_bit(i, hub_irq_map); + + fn = irq_domain_alloc_named_fwnode("HUB"); + WARN_ON(fn == NULL); + if (!fn) + return; + domain = irq_domain_create_linear(fn, IP27_HUB_IRQ_COUNT, + &hub_domain_ops, NULL); + WARN_ON(domain == NULL); + if (!domain) + return; + + irq_set_default_host(domain); irq_set_percpu_devid(IP27_HUB_PEND0_IRQ); - irq_set_chained_handler(IP27_HUB_PEND0_IRQ, ip27_do_irq_mask0); + irq_set_chained_handler_and_data(IP27_HUB_PEND0_IRQ, ip27_do_irq_mask0, + domain); irq_set_percpu_devid(IP27_HUB_PEND1_IRQ); - irq_set_chained_handler(IP27_HUB_PEND1_IRQ, ip27_do_irq_mask1); + irq_set_chained_handler_and_data(IP27_HUB_PEND1_IRQ, ip27_do_irq_mask1, + domain); } diff --git a/arch/mips/sgi-ip27/ip27-xtalk.c b/arch/mips/sgi-ip27/ip27-xtalk.c index ce06aaa115ae..bd5cb855c6e5 100644 --- a/arch/mips/sgi-ip27/ip27-xtalk.c +++ b/arch/mips/sgi-ip27/ip27-xtalk.c @@ -9,6 +9,9 @@ #include <linux/kernel.h> #include <linux/smp.h> +#include <linux/platform_device.h> +#include <linux/platform_data/xtalk-bridge.h> +#include <asm/sn/addrs.h> #include <asm/sn/types.h> #include <asm/sn/klconfig.h> #include <asm/sn/hub.h> @@ -20,7 +23,48 @@ #define XXBOW_WIDGET_PART_NUM 0xd000 /* Xbow in Xbridge */ #define BASE_XBOW_PORT 8 /* Lowest external port */ -extern int bridge_probe(nasid_t nasid, int widget, int masterwid); +static void bridge_platform_create(nasid_t nasid, int widget, int masterwid) +{ + struct xtalk_bridge_platform_data *bd; + struct platform_device *pdev; + unsigned long offset; + + bd = kzalloc(sizeof(*bd), GFP_KERNEL); + if (!bd) + goto no_mem; + pdev = platform_device_alloc("xtalk-bridge", PLATFORM_DEVID_AUTO); + if (!pdev) { + kfree(bd); + goto no_mem; + } + + offset = NODE_OFFSET(nasid); + + bd->bridge_addr = RAW_NODE_SWIN_BASE(nasid, widget); + bd->intr_addr = BIT_ULL(47) + 0x01800000 + 
PI_INT_PEND_MOD; + bd->nasid = nasid; + bd->masterwid = masterwid; + + bd->mem.name = "Bridge PCI MEM"; + bd->mem.start = offset + (widget << SWIN_SIZE_BITS); + bd->mem.end = bd->mem.start + SWIN_SIZE - 1; + bd->mem.flags = IORESOURCE_MEM; + bd->mem_offset = offset; + + bd->io.name = "Bridge PCI IO"; + bd->io.start = offset + (widget << SWIN_SIZE_BITS); + bd->io.end = bd->io.start + SWIN_SIZE - 1; + bd->io.flags = IORESOURCE_IO; + bd->io_offset = offset; + + platform_device_add_data(pdev, bd, sizeof(*bd)); + platform_device_add(pdev); + pr_info("xtalk:n%d/%x bridge widget\n", nasid, widget); + return; + +no_mem: + pr_warn("xtalk:n%d/%x bridge create out of memory\n", nasid, widget); +} static int probe_one_port(nasid_t nasid, int widget, int masterwid) { @@ -31,13 +75,10 @@ static int probe_one_port(nasid_t nasid, int widget, int masterwid) (RAW_NODE_SWIN_BASE(nasid, widget) + WIDGET_ID); partnum = XWIDGET_PART_NUM(widget_id); - printk(KERN_INFO "Cpu %d, Nasid 0x%x, widget 0x%x (partnum 0x%x) is ", - smp_processor_id(), nasid, widget, partnum); - switch (partnum) { case BRIDGE_WIDGET_PART_NUM: case XBRIDGE_WIDGET_PART_NUM: - bridge_probe(nasid, widget, masterwid); + bridge_platform_create(nasid, widget, masterwid); break; default: break; @@ -52,8 +93,6 @@ static int xbow_probe(nasid_t nasid) klxbow_t *xbow_p; unsigned masterwid, i; - printk("is xbow\n"); - /* * found xbow, so may have multiple bridges * need to probe xbow @@ -117,19 +156,17 @@ static void xtalk_probe_node(cnodeid_t nid) (RAW_NODE_SWIN_BASE(nasid, 0x0) + WIDGET_ID); partnum = XWIDGET_PART_NUM(widget_id); - printk(KERN_INFO "Cpu %d, Nasid 0x%x: partnum 0x%x is ", - smp_processor_id(), nasid, partnum); - switch (partnum) { case BRIDGE_WIDGET_PART_NUM: - bridge_probe(nasid, 0x8, 0xa); + bridge_platform_create(nasid, 0x8, 0xa); break; case XBOW_WIDGET_PART_NUM: case XXBOW_WIDGET_PART_NUM: + pr_info("xtalk:n%d/0 xbow widget\n", nasid); xbow_probe(nasid); break; default: - printk(" unknown widget??\n"); + 
pr_info("xtalk:n%d/0 unknown widget (0x%x)\n", nasid, partnum); break; } } diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c index 70a1ab66d252..46537c2ca86a 100644 --- a/arch/mips/txx9/generic/setup.c +++ b/arch/mips/txx9/generic/setup.c @@ -26,6 +26,7 @@ #include <linux/leds.h> #include <linux/device.h> #include <linux/slab.h> +#include <linux/io.h> #include <linux/irq.h> #include <asm/bootinfo.h> #include <asm/idle.h> diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 55559ca0efe4..2245169c72af 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -4,7 +4,7 @@ # config NDS32 - def_bool y + def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE @@ -51,20 +51,20 @@ config GENERIC_CALIBRATE_DELAY def_bool y config GENERIC_CSUM - def_bool y + def_bool y config GENERIC_HWEIGHT - def_bool y + def_bool y config GENERIC_LOCKBREAK - def_bool y + def_bool y depends on PREEMPT config TRACE_IRQFLAGS_SUPPORT def_bool y config STACKTRACE_SUPPORT - def_bool y + def_bool y config FIX_EARLYCON_MEM def_bool y @@ -79,11 +79,11 @@ config NR_CPUS default 1 config MMU - def_bool y + def_bool y config NDS32_BUILTIN_DTB - string "Builtin DTB" - default "" + string "Builtin DTB" + default "" help User can use it to specify the dts of the SoC endmenu diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index f67a327777b5..f43b44d692ca 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -6,7 +6,6 @@ generic-y += bugs.h generic-y += checksum.h generic-y += clkdev.h generic-y += cmpxchg.h -generic-y += cmpxchg-local.h generic-y += compat.h generic-y += cputime.h generic-y += device.h @@ -37,7 +36,6 @@ generic-y += pci.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h -generic-y += segment.h generic-y += serial.h generic-y += switch_to.h generic-y += timex.h diff --git a/arch/nds32/include/asm/assembler.h 
b/arch/nds32/include/asm/assembler.h index c3855782a541..5e7c56926049 100644 --- a/arch/nds32/include/asm/assembler.h +++ b/arch/nds32/include/asm/assembler.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_ASSEMBLER_H__ diff --git a/arch/nds32/include/asm/barrier.h b/arch/nds32/include/asm/barrier.h index faafc373ea6c..16413172fd50 100644 --- a/arch/nds32/include/asm/barrier.h +++ b/arch/nds32/include/asm/barrier.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_ASM_BARRIER_H diff --git a/arch/nds32/include/asm/bitfield.h b/arch/nds32/include/asm/bitfield.h index 7414fcbbab4e..e75212c76b20 100644 --- a/arch/nds32/include/asm/bitfield.h +++ b/arch/nds32/include/asm/bitfield.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_BITFIELD_H__ diff --git a/arch/nds32/include/asm/cache.h b/arch/nds32/include/asm/cache.h index 347db4881c5f..fc3c41b59169 100644 --- a/arch/nds32/include/asm/cache.h +++ b/arch/nds32/include/asm/cache.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_CACHE_H__ diff --git a/arch/nds32/include/asm/cache_info.h b/arch/nds32/include/asm/cache_info.h index 38ec458ba543..e89d8078f3a6 100644 --- a/arch/nds32/include/asm/cache_info.h +++ b/arch/nds32/include/asm/cache_info.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation struct cache_info { diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 8b26198d51bb..d9ac7e6408ef 100644 --- 
a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_CACHEFLUSH_H__ diff --git a/arch/nds32/include/asm/current.h b/arch/nds32/include/asm/current.h index b4dcd22b7bcb..65d30096142b 100644 --- a/arch/nds32/include/asm/current.h +++ b/arch/nds32/include/asm/current.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASM_NDS32_CURRENT_H diff --git a/arch/nds32/include/asm/delay.h b/arch/nds32/include/asm/delay.h index 519ba97acb6e..56ea3894f8f8 100644 --- a/arch/nds32/include/asm/delay.h +++ b/arch/nds32/include/asm/delay.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_DELAY_H__ diff --git a/arch/nds32/include/asm/elf.h b/arch/nds32/include/asm/elf.h index 02250626b9f0..1c8e56d7013d 100644 --- a/arch/nds32/include/asm/elf.h +++ b/arch/nds32/include/asm/elf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASMNDS32_ELF_H diff --git a/arch/nds32/include/asm/fixmap.h b/arch/nds32/include/asm/fixmap.h index 0e60e153a71a..5a4bf11e5800 100644 --- a/arch/nds32/include/asm/fixmap.h +++ b/arch/nds32/include/asm/fixmap.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_FIXMAP_H diff --git a/arch/nds32/include/asm/futex.h b/arch/nds32/include/asm/futex.h index baf178bf1d0b..5213c65c2e0b 100644 --- a/arch/nds32/include/asm/futex.h +++ b/arch/nds32/include/asm/futex.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* 
SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_FUTEX_H__ diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index 425d546cb059..b3a82c97ded3 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASM_HIGHMEM_H diff --git a/arch/nds32/include/asm/io.h b/arch/nds32/include/asm/io.h index 5ef8ae5ba833..16f262322b8f 100644 --- a/arch/nds32/include/asm/io.h +++ b/arch/nds32/include/asm/io.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_IO_H diff --git a/arch/nds32/include/asm/irqflags.h b/arch/nds32/include/asm/irqflags.h index 2bfd00f8bc48..fb45ec46bb1b 100644 --- a/arch/nds32/include/asm/irqflags.h +++ b/arch/nds32/include/asm/irqflags.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #include <asm/nds32.h> diff --git a/arch/nds32/include/asm/l2_cache.h b/arch/nds32/include/asm/l2_cache.h index 37dd5ef61de8..3ea48e19e6de 100644 --- a/arch/nds32/include/asm/l2_cache.h +++ b/arch/nds32/include/asm/l2_cache.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef L2_CACHE_H diff --git a/arch/nds32/include/asm/linkage.h b/arch/nds32/include/asm/linkage.h index e708c8bdb926..a696469abb70 100644 --- a/arch/nds32/include/asm/linkage.h +++ b/arch/nds32/include/asm/linkage.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_LINKAGE_H diff --git 
a/arch/nds32/include/asm/memory.h b/arch/nds32/include/asm/memory.h index 60efc726b56e..940d32842793 100644 --- a/arch/nds32/include/asm/memory.h +++ b/arch/nds32/include/asm/memory.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_MEMORY_H @@ -15,14 +15,6 @@ #define PHYS_OFFSET (0x0) #endif -#ifndef __virt_to_bus -#define __virt_to_bus __virt_to_phys -#endif - -#ifndef __bus_to_virt -#define __bus_to_virt __phys_to_virt -#endif - /* * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area diff --git a/arch/nds32/include/asm/mmu.h b/arch/nds32/include/asm/mmu.h index 88b9ee8c1064..89d63afee455 100644 --- a/arch/nds32/include/asm/mmu.h +++ b/arch/nds32/include/asm/mmu.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_MMU_H diff --git a/arch/nds32/include/asm/mmu_context.h b/arch/nds32/include/asm/mmu_context.h index fd7d13cefccc..b8fd3d189fdc 100644 --- a/arch/nds32/include/asm/mmu_context.h +++ b/arch/nds32/include/asm/mmu_context.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_MMU_CONTEXT_H diff --git a/arch/nds32/include/asm/module.h b/arch/nds32/include/asm/module.h index 16cf9c7237ad..a3a08e993c65 100644 --- a/arch/nds32/include/asm/module.h +++ b/arch/nds32/include/asm/module.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASM_NDS32_MODULE_H diff --git a/arch/nds32/include/asm/nds32.h b/arch/nds32/include/asm/nds32.h index 68c38151c3e4..4994f6a9e0a0 100644 --- a/arch/nds32/include/asm/nds32.h +++ b/arch/nds32/include/asm/nds32.h @@ -1,4 
+1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASM_NDS32_NDS32_H_ diff --git a/arch/nds32/include/asm/page.h b/arch/nds32/include/asm/page.h index 947f0491c9a7..8feb1fa12f01 100644 --- a/arch/nds32/include/asm/page.h +++ b/arch/nds32/include/asm/page.h @@ -1,5 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * SPDX-License-Identifier: GPL-2.0 * Copyright (C) 2005-2017 Andes Technology Corporation */ diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index 3c5fee5b5759..3cbc749c79aa 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMNDS32_PGALLOC_H diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index ee59c1f9e4fc..c70cc56bec09 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMNDS32_PGTABLE_H diff --git a/arch/nds32/include/asm/proc-fns.h b/arch/nds32/include/asm/proc-fns.h index bedc4f59e064..27c617fa77af 100644 --- a/arch/nds32/include/asm/proc-fns.h +++ b/arch/nds32/include/asm/proc-fns.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_PROCFNS_H__ diff --git a/arch/nds32/include/asm/processor.h b/arch/nds32/include/asm/processor.h index 72024f8bc129..b82369c7659d 100644 --- a/arch/nds32/include/asm/processor.h +++ b/arch/nds32/include/asm/processor.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation 
#ifndef __ASM_NDS32_PROCESSOR_H diff --git a/arch/nds32/include/asm/ptrace.h b/arch/nds32/include/asm/ptrace.h index c4538839055c..919ee223620c 100644 --- a/arch/nds32/include/asm/ptrace.h +++ b/arch/nds32/include/asm/ptrace.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_PTRACE_H diff --git a/arch/nds32/include/asm/shmparam.h b/arch/nds32/include/asm/shmparam.h index fd1cff64b68e..3aeee946973d 100644 --- a/arch/nds32/include/asm/shmparam.h +++ b/arch/nds32/include/asm/shmparam.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMNDS32_SHMPARAM_H diff --git a/arch/nds32/include/asm/string.h b/arch/nds32/include/asm/string.h index 179272caa540..cae8fe16de98 100644 --- a/arch/nds32/include/asm/string.h +++ b/arch/nds32/include/asm/string.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_STRING_H diff --git a/arch/nds32/include/asm/swab.h b/arch/nds32/include/asm/swab.h index e01a755a37d2..362a466f2976 100644 --- a/arch/nds32/include/asm/swab.h +++ b/arch/nds32/include/asm/swab.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_SWAB_H__ diff --git a/arch/nds32/include/asm/syscall.h b/arch/nds32/include/asm/syscall.h index 174b8571d362..899b2fb4b52f 100644 --- a/arch/nds32/include/asm/syscall.h +++ b/arch/nds32/include/asm/syscall.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. 
// Copyright (C) 2005-2017 Andes Technology Corporation diff --git a/arch/nds32/include/asm/syscalls.h b/arch/nds32/include/asm/syscalls.h index da32101b455d..f3b16f602cb5 100644 --- a/arch/nds32/include/asm/syscalls.h +++ b/arch/nds32/include/asm/syscalls.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_SYSCALLS_H diff --git a/arch/nds32/include/asm/thread_info.h b/arch/nds32/include/asm/thread_info.h index bff741ff337b..c135111ec44e 100644 --- a/arch/nds32/include/asm/thread_info.h +++ b/arch/nds32/include/asm/thread_info.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_THREAD_INFO_H @@ -42,7 +42,6 @@ struct thread_info { * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user - * TIF_USEDFPU - FPU was used by this task this quantum (SMP) * TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_SIGPENDING 1 @@ -50,7 +49,6 @@ struct thread_info { #define TIF_SINGLESTEP 3 #define TIF_NOTIFY_RESUME 4 /* callback before returning to user */ #define TIF_SYSCALL_TRACE 8 -#define TIF_USEDFPU 16 #define TIF_POLLING_NRFLAG 17 #define TIF_MEMDIE 18 #define TIF_FREEZE 19 diff --git a/arch/nds32/include/asm/tlb.h b/arch/nds32/include/asm/tlb.h index d5ae571c8d30..a8aff1c8b4f4 100644 --- a/arch/nds32/include/asm/tlb.h +++ b/arch/nds32/include/asm/tlb.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASMNDS32_TLB_H diff --git a/arch/nds32/include/asm/tlbflush.h b/arch/nds32/include/asm/tlbflush.h index 38ee769b18d8..97155366ea01 100644 --- a/arch/nds32/include/asm/tlbflush.h +++ b/arch/nds32/include/asm/tlbflush.h @@ -1,4 
+1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMNDS32_TLBFLUSH_H diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h index 116598b47c4d..8916ad9f9f13 100644 --- a/arch/nds32/include/asm/uaccess.h +++ b/arch/nds32/include/asm/uaccess.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMANDES_UACCESS_H diff --git a/arch/nds32/include/asm/unistd.h b/arch/nds32/include/asm/unistd.h index b586a2862beb..bf5e2d440913 100644 --- a/arch/nds32/include/asm/unistd.h +++ b/arch/nds32/include/asm/unistd.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #define __ARCH_WANT_SYS_CLONE diff --git a/arch/nds32/include/asm/vdso.h b/arch/nds32/include/asm/vdso.h index af2c6afc2469..89b113ffc3dc 100644 --- a/arch/nds32/include/asm/vdso.h +++ b/arch/nds32/include/asm/vdso.h @@ -1,5 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * SPDX-License-Identifier: GPL-2.0 * Copyright (C) 2005-2017 Andes Technology Corporation */ diff --git a/arch/nds32/include/asm/vdso_datapage.h b/arch/nds32/include/asm/vdso_datapage.h index 79db5a12ca5e..74c68802021e 100644 --- a/arch/nds32/include/asm/vdso_datapage.h +++ b/arch/nds32/include/asm/vdso_datapage.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2012 ARM Limited // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_VDSO_DATAPAGE_H @@ -20,6 +20,7 @@ struct vdso_data { u32 xtime_clock_sec; /* CLOCK_REALTIME - seconds */ u32 cs_mult; /* clocksource multiplier */ u32 cs_shift; /* Cycle to nanosecond divisor (power of two) */ + u32 hrtimer_res; /* hrtimer resolution */ u64 cs_cycle_last; /* last cycle value */ u64 cs_mask; /* 
clocksource mask */ diff --git a/arch/nds32/include/asm/vdso_timer_info.h b/arch/nds32/include/asm/vdso_timer_info.h index 50ba117cff12..328439ce37db 100644 --- a/arch/nds32/include/asm/vdso_timer_info.h +++ b/arch/nds32/include/asm/vdso_timer_info.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation extern struct timer_info_t timer_info; diff --git a/arch/nds32/include/uapi/asm/auxvec.h b/arch/nds32/include/uapi/asm/auxvec.h index 2d3213f5e595..b5d58ea8decb 100644 --- a/arch/nds32/include/uapi/asm/auxvec.h +++ b/arch/nds32/include/uapi/asm/auxvec.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_AUXVEC_H diff --git a/arch/nds32/include/uapi/asm/byteorder.h b/arch/nds32/include/uapi/asm/byteorder.h index a23f6f3a2468..511e653c709d 100644 --- a/arch/nds32/include/uapi/asm/byteorder.h +++ b/arch/nds32/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_BYTEORDER_H__ diff --git a/arch/nds32/include/uapi/asm/cachectl.h b/arch/nds32/include/uapi/asm/cachectl.h index 4cdca9b23974..73793662815c 100644 --- a/arch/nds32/include/uapi/asm/cachectl.h +++ b/arch/nds32/include/uapi/asm/cachectl.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 1994, 1995, 1996 by Ralf Baechle // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASM_CACHECTL diff --git a/arch/nds32/include/uapi/asm/param.h b/arch/nds32/include/uapi/asm/param.h index e3fb723ee362..2977534a6bd3 100644 --- a/arch/nds32/include/uapi/asm/param.h +++ b/arch/nds32/include/uapi/asm/param.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 
2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_PARAM_H diff --git a/arch/nds32/include/uapi/asm/ptrace.h b/arch/nds32/include/uapi/asm/ptrace.h index 358c99e399d0..1a6e01c00e6f 100644 --- a/arch/nds32/include/uapi/asm/ptrace.h +++ b/arch/nds32/include/uapi/asm/ptrace.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __UAPI_ASM_NDS32_PTRACE_H diff --git a/arch/nds32/include/uapi/asm/sigcontext.h b/arch/nds32/include/uapi/asm/sigcontext.h index 58afc416473e..628ff6b75825 100644 --- a/arch/nds32/include/uapi/asm/sigcontext.h +++ b/arch/nds32/include/uapi/asm/sigcontext.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMNDS32_SIGCONTEXT_H diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h index 4ec8f543103f..c691735017ed 100644 --- a/arch/nds32/include/uapi/asm/unistd.h +++ b/arch/nds32/include/uapi/asm/unistd.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2005-2017 Andes Technology Corporation #define __ARCH_WANT_STAT64 diff --git a/arch/nds32/kernel/.gitignore b/arch/nds32/kernel/.gitignore new file mode 100644 index 000000000000..c5f676c3c224 --- /dev/null +++ b/arch/nds32/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds diff --git a/arch/nds32/kernel/cacheinfo.c b/arch/nds32/kernel/cacheinfo.c index 0a7bc696dd55..aab98e447feb 100644 --- a/arch/nds32/kernel/cacheinfo.c +++ b/arch/nds32/kernel/cacheinfo.c @@ -13,7 +13,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->level = level; this_leaf->type = type; this_leaf->coherency_line_size = CACHE_LINE_SIZE(cache_type); - this_leaf->number_of_sets = CACHE_SET(cache_type);; + this_leaf->number_of_sets = CACHE_SET(cache_type); this_leaf->ways_of_associativity = 
CACHE_WAY(cache_type); this_leaf->size = this_leaf->number_of_sets * this_leaf->coherency_line_size * this_leaf->ways_of_associativity; diff --git a/arch/nds32/kernel/ex-exit.S b/arch/nds32/kernel/ex-exit.S index 97ba15cd4180..1df02a793364 100644 --- a/arch/nds32/kernel/ex-exit.S +++ b/arch/nds32/kernel/ex-exit.S @@ -163,7 +163,7 @@ resume_kernel: gie_disable lwi $t0, [tsk+#TSK_TI_PREEMPT] bnez $t0, no_work_pending -need_resched: + lwi $t0, [tsk+#TSK_TI_FLAGS] andi $p1, $t0, #_TIF_NEED_RESCHED beqz $p1, no_work_pending @@ -173,7 +173,7 @@ need_resched: beqz $t0, no_work_pending jal preempt_schedule_irq - b need_resched + b no_work_pending #endif /* diff --git a/arch/nds32/kernel/nds32_ksyms.c b/arch/nds32/kernel/nds32_ksyms.c index 5ecebd0e60cb..20719e42ae36 100644 --- a/arch/nds32/kernel/nds32_ksyms.c +++ b/arch/nds32/kernel/nds32_ksyms.c @@ -23,9 +23,3 @@ EXPORT_SYMBOL(memzero); EXPORT_SYMBOL(__arch_copy_from_user); EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__arch_clear_user); - -/* cache handling */ -EXPORT_SYMBOL(cpu_icache_inval_all); -EXPORT_SYMBOL(cpu_dcache_wbinval_all); -EXPORT_SYMBOL(cpu_dma_inval_range); -EXPORT_SYMBOL(cpu_dma_wb_range); diff --git a/arch/nds32/kernel/vdso.c b/arch/nds32/kernel/vdso.c index 016f15891f6d..90bcae6f8554 100644 --- a/arch/nds32/kernel/vdso.c +++ b/arch/nds32/kernel/vdso.c @@ -220,6 +220,7 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->xtime_coarse_sec = tk->xtime_sec; vdso_data->xtime_coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_data->hrtimer_res = hrtimer_resolution; vdso_write_end(vdso_data); } diff --git a/arch/nds32/kernel/vdso/.gitignore b/arch/nds32/kernel/vdso/.gitignore new file mode 100644 index 000000000000..f8b69d84238e --- /dev/null +++ b/arch/nds32/kernel/vdso/.gitignore @@ -0,0 +1 @@ +vdso.lds diff --git a/arch/nds32/kernel/vdso/Makefile b/arch/nds32/kernel/vdso/Makefile index e6c50a701313..8792fda19a64 100644 --- a/arch/nds32/kernel/vdso/Makefile +++ 
b/arch/nds32/kernel/vdso/Makefile @@ -11,10 +11,8 @@ obj-vdso := note.o datapage.o sigreturn.o gettimeofday.o targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) -ccflags-y := -shared -fno-common -fno-builtin -ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) -ccflags-y += -fPIC -Wl,-shared -g +ccflags-y := -shared -fno-common -fno-builtin -nostdlib -fPIC -Wl,-shared -g \ + -Wl,-soname=linux-vdso.so.1 -Wl,--hash-style=sysv # Disable gcov profiling for VDSO code GCOV_PROFILE := n @@ -28,7 +26,7 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH) $(obj)/vdso.o : $(obj)/vdso.so # Link rule for the .so file, .lds has to be first -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold) @@ -40,9 +38,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # Generate VDSO offsets using helper script gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ -endef + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) @@ -65,7 +61,7 @@ gettimeofday.o : gettimeofday.c FORCE # Actual build commands quiet_cmd_vdsold = VDSOL $@ - cmd_vdsold = $(CC) $(c_flags) -Wl,-n -Wl,-T $^ -o $@ + cmd_vdsold = $(CC) $(c_flags) -Wl,-n -Wl,-T $(real-prereqs) -o $@ quiet_cmd_vdsoas = VDSOA $@ cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< quiet_cmd_vdsocc = VDSOA $@ diff --git a/arch/nds32/kernel/vdso/gettimeofday.c b/arch/nds32/kernel/vdso/gettimeofday.c index 038721af40e3..b02581891c33 100644 --- a/arch/nds32/kernel/vdso/gettimeofday.c +++ b/arch/nds32/kernel/vdso/gettimeofday.c @@ -208,6 +208,8 @@ static notrace int clock_getres_fallback(clockid_t _clk_id, notrace int __vdso_clock_getres(clockid_t clk_id, struct timespec *res) { + struct vdso_data *vdata = __get_datapage(); + 
if (res == NULL) return 0; switch (clk_id) { @@ -215,7 +217,7 @@ notrace int __vdso_clock_getres(clockid_t clk_id, struct timespec *res) case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_RAW: res->tv_sec = 0; - res->tv_nsec = CLOCK_REALTIME_RES; + res->tv_nsec = vdata->hrtimer_res; break; case CLOCK_REALTIME_COARSE: case CLOCK_MONOTONIC_COARSE: diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 1a4ab1b7525f..55703b03d172 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -260,7 +260,7 @@ void __set_fixmap(enum fixed_addresses idx, BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); - pte = (pte_t *)&fixmap_pmd_p[pte_index(addr)];; + pte = (pte_t *)&fixmap_pmd_p[pte_index(addr)]; if (pgprot_val(flags)) { set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index d7ef3512504a..a8ffdd007f6c 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -33,7 +33,6 @@ generic-y += pci.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h -generic-y += segment.h generic-y += serial.h generic-y += spinlock.h generic-y += topology.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 1919cc5e0f11..164be10062bc 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -34,7 +34,6 @@ generic-y += qspinlock.h generic-y += qrwlock_types.h generic-y += qrwlock.h generic-y += sections.h -generic-y += segment.h generic-y += shmparam.h generic-y += switch_to.h generic-y += topology.h diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c index eb97a8e7c8aa..e8fb2a764f46 100644 --- a/arch/openrisc/kernel/ptrace.c +++ b/arch/openrisc/kernel/ptrace.c @@ -30,7 +30,6 @@ #include <linux/elf.h> #include <asm/thread_info.h> -#include <asm/segment.h> #include <asm/page.h> #include <asm/pgtable.h> diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index 
c605bdad1746..17c00d06d91b 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -39,7 +39,6 @@ #include <linux/device.h> #include <asm/sections.h> -#include <asm/segment.h> #include <asm/pgtable.h> #include <asm/types.h> #include <asm/setup.h> diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index d8981cbb852a..6ed7293ef007 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -35,7 +35,6 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> -#include <asm/segment.h> #include <asm/io.h> #include <asm/pgtable.h> #include <asm/unwinder.h> diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index abe87e54e231..e63cb4a91a3e 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -32,7 +32,6 @@ #include <linux/blkdev.h> /* for initrd_* */ #include <linux/pagemap.h> -#include <asm/segment.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/dma.h> diff --git a/arch/openrisc/mm/tlb.c b/arch/openrisc/mm/tlb.c index 6c253a2e86bc..7f9f50161dfe 100644 --- a/arch/openrisc/mm/tlb.c +++ b/arch/openrisc/mm/tlb.c @@ -26,7 +26,6 @@ #include <linux/mm.h> #include <linux/init.h> -#include <asm/segment.h> #include <asm/tlbflush.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index ed2d8cc94909..005ee8ad0446 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -19,7 +19,6 @@ generic-y += mmiowb.h generic-y += percpu.h generic-y += preempt.h generic-y += seccomp.h -generic-y += segment.h generic-y += trace_clock.h generic-y += user.h generic-y += vga.h diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index fe8ca623add8..c9e377d59232 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -424,3 +424,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common 
io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 1d1183048cfd..2781ebf6add4 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -93,6 +93,7 @@ #define VMALLOC_REGION_ID NON_LINEAR_REGION_ID(H_VMALLOC_START) #define IO_REGION_ID NON_LINEAR_REGION_ID(H_KERN_IO_START) #define VMEMMAP_REGION_ID NON_LINEAR_REGION_ID(H_VMEMMAP_START) +#define INVALID_REGION_ID (VMEMMAP_REGION_ID + 1) /* * Defines the address of the vmemap area, in its own region on @@ -119,14 +120,15 @@ static inline int get_region_id(unsigned long ea) if (id == 0) return USER_REGION_ID; + if (id != (PAGE_OFFSET >> 60)) + return INVALID_REGION_ID; + if (ea < H_KERN_VIRT_START) return LINEAR_MAP_REGION_ID; - VM_BUG_ON(id != 0xc); BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2); region_id = NON_LINEAR_REGION_ID(ea); - VM_BUG_ON(region_id > VMEMMAP_REGION_ID); return region_id; } diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e6b5bb012ccb..013c76a0a03e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -201,6 +201,8 @@ struct kvmppc_spapr_tce_iommu_table { struct kref kref; }; +#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) + struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; @@ -210,6 +212,7 @@ struct kvmppc_spapr_tce_table { u64 offset; /* in pages */ u64 size; /* window size in pages */ struct list_head iommu_tables; + struct mutex alloc_lock; struct page *pages[0]; }; @@ -222,6 +225,7 @@ extern struct kvm_device_ops kvm_xics_ops; struct kvmppc_xive; struct kvmppc_xive_vcpu; extern struct 
kvm_device_ops kvm_xive_ops; +extern struct kvm_device_ops kvm_xive_native_ops; struct kvmppc_passthru_irqmap; @@ -312,7 +316,11 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; - struct kvmppc_xive *xive; + struct kvmppc_xive *xive; /* Current XIVE device in use */ + struct { + struct kvmppc_xive *native; + struct kvmppc_xive *xics_on_xive; + } xive_devices; struct kvmppc_passthru_irqmap *pimap; #endif struct kvmppc_ops *kvm_ops; @@ -449,6 +457,7 @@ struct kvmppc_passthru_irqmap { #define KVMPPC_IRQ_DEFAULT 0 #define KVMPPC_IRQ_MPIC 1 #define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ +#define KVMPPC_IRQ_XIVE 3 /* XIVE native exploitation mode */ #define MMIO_HPTE_CACHE_SIZE 4 diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ac22b28ae78d..bc892380e6cd 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -197,10 +197,6 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table( (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ (stt)->size, (ioba), (npages)) ? 
\ H_PARAMETER : H_SUCCESS) -extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, - unsigned long *ua, unsigned long **prmap); -extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, - unsigned long idx, unsigned long tce); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, @@ -273,6 +269,7 @@ union kvmppc_one_reg { u64 addr; u64 length; } vpaval; + u64 xive_timaval[2]; }; struct kvmppc_ops { @@ -480,6 +477,9 @@ extern void kvm_hv_vm_activated(void); extern void kvm_hv_vm_deactivated(void); extern bool kvm_hv_mode_active(void); +extern void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu, + struct kvm_nested_guest *nested); + #else static inline void __init kvm_cma_reserve(void) {} @@ -594,6 +594,22 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); + +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.irq_type == KVMPPC_IRQ_XIVE; +} + +extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu); +extern void kvmppc_xive_native_init_module(void); +extern void kvmppc_xive_native_exit_module(void); +extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, + union kvmppc_one_reg *val); +extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, + union kvmppc_one_reg *val); + #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) { return -1; } @@ -617,6 +633,21 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { 
return -ENODEV; } static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } + +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) + { return 0; } +static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; } +static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { } +static inline void kvmppc_xive_native_init_module(void) { } +static inline void kvmppc_xive_native_exit_module(void) { } +static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, + union kvmppc_one_reg *val) +{ return 0; } +static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, + union kvmppc_one_reg *val) +{ return -ENOENT; } + #endif /* CONFIG_KVM_XIVE */ #if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER) @@ -665,6 +696,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index); long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index); +long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long dest, unsigned long src); long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, unsigned long slb_v, unsigned int status, bool data); unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index b579a943407b..eaf76f57023a 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -23,6 +23,7 @@ * same offset regardless of where the code is executing */ extern void __iomem *xive_tima; +extern unsigned long xive_tima_os; /* * Offset in the TM area of our current execution level (provided by @@ -73,6 +74,8 @@ struct xive_q { u32 esc_irq; atomic_t count; atomic_t pending_count; + u64 guest_qaddr; + u32 guest_qshift; }; /* Global enable flags for the XIVE support */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h index 26ca425f4c2c..b0f72dea8b11 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -482,6 +482,8 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */ #define KVM_REG_PPC_ICP_PPRI_MASK 0xff +#define KVM_REG_PPC_VP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x8d) + /* Device control API: PPC-specific devices */ #define KVM_DEV_MPIC_GRP_MISC 1 #define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */ @@ -677,4 +679,48 @@ struct kvm_ppc_cpu_char { #define KVM_XICS_PRESENTED (1ULL << 43) #define KVM_XICS_QUEUED (1ULL << 44) +/* POWER9 XIVE Native Interrupt Controller */ +#define KVM_DEV_XIVE_GRP_CTRL 1 +#define KVM_DEV_XIVE_RESET 1 +#define KVM_DEV_XIVE_EQ_SYNC 2 +#define KVM_DEV_XIVE_GRP_SOURCE 2 /* 64-bit source identifier */ +#define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3 /* 64-bit source identifier */ +#define KVM_DEV_XIVE_GRP_EQ_CONFIG 4 /* 64-bit EQ identifier */ +#define KVM_DEV_XIVE_GRP_SOURCE_SYNC 5 /* 64-bit source identifier */ + +/* Layout of 64-bit XIVE source attribute values */ +#define KVM_XIVE_LEVEL_SENSITIVE (1ULL << 0) +#define KVM_XIVE_LEVEL_ASSERTED (1ULL << 1) + +/* Layout of 64-bit XIVE source configuration attribute values */ +#define KVM_XIVE_SOURCE_PRIORITY_SHIFT 0 +#define KVM_XIVE_SOURCE_PRIORITY_MASK 0x7 +#define KVM_XIVE_SOURCE_SERVER_SHIFT 3 +#define KVM_XIVE_SOURCE_SERVER_MASK 0xfffffff8ULL +#define KVM_XIVE_SOURCE_MASKED_SHIFT 32 +#define KVM_XIVE_SOURCE_MASKED_MASK 0x100000000ULL +#define KVM_XIVE_SOURCE_EISN_SHIFT 33 +#define KVM_XIVE_SOURCE_EISN_MASK 0xfffffffe00000000ULL + +/* Layout of 64-bit EQ identifier */ +#define KVM_XIVE_EQ_PRIORITY_SHIFT 0 +#define KVM_XIVE_EQ_PRIORITY_MASK 0x7 +#define KVM_XIVE_EQ_SERVER_SHIFT 3 +#define KVM_XIVE_EQ_SERVER_MASK 0xfffffff8ULL + +/* Layout of EQ configuration values (64 bytes) */ +struct kvm_ppc_xive_eq { + __u32 flags; + __u32 qshift; + __u64 qaddr; + __u32 qtoggle; + __u32 qindex; + __u8 
pad[40]; +}; + +#define KVM_XIVE_EQ_ALWAYS_NOTIFY 0x00000001 + +#define KVM_XIVE_TIMA_PAGE_OFFSET 0 +#define KVM_XIVE_ESB_PAGE_OFFSET 4 + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index f2ed3ef4b129..862e2890bd3d 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -767,7 +767,6 @@ static void cacheinfo_create_index_dir(struct cache *cache, int index, cache_dir->kobj, "index%d", index); if (rc) { kobject_put(&index_dir->kobj); - kfree(index_dir); return; } diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 00f5a63c8d9a..103655d84b4b 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -509,3 +509,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 3223aec88b2c..4c67cc79de7c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -94,7 +94,7 @@ endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o -kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o +kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o book3s_xive_native.o kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o kvm-book3s_64-module-objs := \ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 10c5579d20ce..61a212d0daf0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -651,6 +651,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); break; #endif /* CONFIG_KVM_XICS */ +#ifdef CONFIG_KVM_XIVE + 
case KVM_REG_PPC_VP_STATE: + if (!vcpu->arch.xive_vcpu) { + r = -ENXIO; + break; + } + if (xive_enabled()) + r = kvmppc_xive_native_get_vp(vcpu, val); + else + r = -ENXIO; + break; +#endif /* CONFIG_KVM_XIVE */ case KVM_REG_PPC_FSCR: *val = get_reg_val(id, vcpu->arch.fscr); break; @@ -724,6 +736,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); break; #endif /* CONFIG_KVM_XICS */ +#ifdef CONFIG_KVM_XIVE + case KVM_REG_PPC_VP_STATE: + if (!vcpu->arch.xive_vcpu) { + r = -ENXIO; + break; + } + if (xive_enabled()) + r = kvmppc_xive_native_set_vp(vcpu, val); + else + r = -ENXIO; + break; +#endif /* CONFIG_KVM_XIVE */ case KVM_REG_PPC_FSCR: vcpu->arch.fscr = set_reg_val(id, *val); break; @@ -891,6 +915,17 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) kvmppc_rtas_tokens_free(kvm); WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); #endif + +#ifdef CONFIG_KVM_XICS + /* + * Free the XIVE devices which are not directly freed by the + * device 'release' method + */ + kfree(kvm->arch.xive_devices.native); + kvm->arch.xive_devices.native = NULL; + kfree(kvm->arch.xive_devices.xics_on_xive); + kvm->arch.xive_devices.xics_on_xive = NULL; +#endif /* CONFIG_KVM_XICS */ } int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) @@ -1050,6 +1085,9 @@ static int kvmppc_book3s_init(void) if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); + kvmppc_xive_native_init_module(); + kvm_register_device_ops(&kvm_xive_native_ops, + KVM_DEV_TYPE_XIVE); } else #endif kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); @@ -1060,8 +1098,10 @@ static int kvmppc_book3s_init(void) static void kvmppc_book3s_exit(void) { #ifdef CONFIG_KVM_XICS - if (xics_on_xive()) + if (xics_on_xive()) { kvmppc_xive_exit_module(); + kvmppc_xive_native_exit_module(); + } #endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kvmppc_book3s_exit_pr(); diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
b/arch/powerpc/kvm/book3s_64_vio.c index f100e331e69b..66270e07449a 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -228,11 +228,33 @@ static void release_spapr_tce_table(struct rcu_head *head) unsigned long i, npages = kvmppc_tce_pages(stt->size); for (i = 0; i < npages; i++) - __free_page(stt->pages[i]); + if (stt->pages[i]) + __free_page(stt->pages[i]); kfree(stt); } +static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt, + unsigned long sttpage) +{ + struct page *page = stt->pages[sttpage]; + + if (page) + return page; + + mutex_lock(&stt->alloc_lock); + page = stt->pages[sttpage]; + if (!page) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + WARN_ON_ONCE(!page); + if (page) + stt->pages[sttpage] = page; + } + mutex_unlock(&stt->alloc_lock); + + return page; +} + static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf) { struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data; @@ -241,7 +263,10 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf) if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) return VM_FAULT_SIGBUS; - page = stt->pages[vmf->pgoff]; + page = kvm_spapr_get_tce_page(stt, vmf->pgoff); + if (!page) + return VM_FAULT_OOM; + get_page(page); vmf->page = page; return 0; @@ -296,7 +321,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *siter; unsigned long npages, size = args->size; int ret = -ENOMEM; - int i; if (!args->size || args->page_shift < 12 || args->page_shift > 34 || (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) @@ -318,14 +342,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, stt->offset = args->offset; stt->size = size; stt->kvm = kvm; + mutex_init(&stt->alloc_lock); INIT_LIST_HEAD_RCU(&stt->iommu_tables); - for (i = 0; i < npages; i++) { - stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!stt->pages[i]) - goto fail; - } - mutex_lock(&kvm->lock); /* Check this LIOBN hasn't been 
previously allocated */ @@ -352,17 +371,28 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, if (ret >= 0) return ret; - fail: - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); - kfree(stt); fail_acct: kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); return ret; } +static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, + unsigned long *ua) +{ + unsigned long gfn = tce >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + memslot = search_memslots(kvm_memslots(kvm), gfn); + if (!memslot) + return -EINVAL; + + *ua = __gfn_to_hva_memslot(memslot, gfn) | + (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + + return 0; +} + static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { @@ -378,7 +408,7 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_TOO_HARD; - if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + if (kvmppc_tce_to_ua(stt->kvm, tce, &ua)) return H_TOO_HARD; list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { @@ -397,6 +427,36 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } +/* + * Handles TCE requests for emulated devices. + * Puts guest TCE values to the table and expects user space to convert them. + * Cannot fail so kvmppc_tce_validate must be called before it. 
+ */ +static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, + unsigned long idx, unsigned long tce) +{ + struct page *page; + u64 *tbl; + unsigned long sttpage; + + idx -= stt->offset; + sttpage = idx / TCES_PER_PAGE; + page = stt->pages[sttpage]; + + if (!page) { + /* We allow any TCE, not just with read|write permissions */ + if (!tce) + return; + + page = kvm_spapr_get_tce_page(stt, sttpage); + if (!page) + return; + } + tbl = page_to_virt(page); + + tbl[idx % TCES_PER_PAGE] = tce; +} + static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, unsigned long entry) { @@ -551,7 +611,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { + if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; goto unlock_exit; } @@ -612,7 +672,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) { ret = H_TOO_HARD; goto unlock_exit; } @@ -647,7 +707,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, } tce = be64_to_cpu(tce); - if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 2206bc729b9a..484b47fa3960 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -66,8 +66,6 @@ #endif -#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) - /* * Finds a TCE table descriptor by LIOBN. 
* @@ -88,6 +86,25 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, EXPORT_SYMBOL_GPL(kvmppc_find_table); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce, + unsigned long *ua, unsigned long **prmap) +{ + unsigned long gfn = tce >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + memslot = search_memslots(kvm_memslots_raw(kvm), gfn); + if (!memslot) + return -EINVAL; + + *ua = __gfn_to_hva_memslot(memslot, gfn) | + (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + + if (prmap) + *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + + return 0; +} + /* * Validates TCE address. * At the moment flags and page mask are validated. @@ -111,7 +128,7 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; - if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua, NULL)) return H_TOO_HARD; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -129,7 +146,6 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } -#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ /* Note on the use of page_address() in real mode, * @@ -161,13 +177,9 @@ static u64 *kvmppc_page_address(struct page *page) /* * Handles TCE requests for emulated devices. * Puts guest TCE values to the table and expects user space to convert them. - * Called in both real and virtual modes. - * Cannot fail so kvmppc_tce_validate must be called before it. - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM + * Cannot fail so kvmppc_rm_tce_validate must be called before it. 
*/ -void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, +static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, unsigned long idx, unsigned long tce) { struct page *page; @@ -175,35 +187,48 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; + /* + * page must not be NULL in real mode, + * kvmppc_rm_ioba_validate() must have taken care of this. + */ + WARN_ON_ONCE_RM(!page); tbl = kvmppc_page_address(page); tbl[idx % TCES_PER_PAGE] = tce; } -EXPORT_SYMBOL_GPL(kvmppc_tce_put); -long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, - unsigned long *ua, unsigned long **prmap) +/* + * TCEs pages are allocated in kvmppc_rm_tce_put() which won't be able to do so + * in real mode. + * Check if kvmppc_rm_tce_put() can succeed in real mode, i.e. a TCEs page is + * allocated or not required (when clearing a tce entry). + */ +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages, bool clearing) { - unsigned long gfn = tce >> PAGE_SHIFT; - struct kvm_memory_slot *memslot; + unsigned long i, idx, sttpage, sttpages; + unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages); - memslot = search_memslots(kvm_memslots(kvm), gfn); - if (!memslot) - return -EINVAL; - - *ua = __gfn_to_hva_memslot(memslot, gfn) | - (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + if (ret) + return ret; + /* + * clearing==true says kvmppc_rm_tce_put won't be allocating pages + * for empty tces. 
+ */ + if (clearing) + return H_SUCCESS; -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - if (prmap) - *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; -#endif + idx = (ioba >> stt->page_shift) - stt->offset; + sttpage = idx / TCES_PER_PAGE; + sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) / + TCES_PER_PAGE; + for (i = sttpage; i < sttpage + sttpages; ++i) + if (!stt->pages[i]) + return H_TOO_HARD; - return 0; + return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua); -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) @@ -381,7 +406,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (!stt) return H_TOO_HARD; - ret = kvmppc_ioba_validate(stt, ioba, 1); + ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0); if (ret != H_SUCCESS) return ret; @@ -390,7 +415,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return ret; dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) + if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; entry = ioba >> stt->page_shift; @@ -409,7 +434,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, } } - kvmppc_tce_put(stt, entry, tce); + kvmppc_rm_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -480,7 +505,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (tce_list & (SZ_4K - 1)) return H_PARAMETER; - ret = kvmppc_ioba_validate(stt, ioba, npages); + ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false); if (ret != H_SUCCESS) return ret; @@ -492,7 +517,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ struct mm_iommu_table_group_mem_t *mem; - if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) return H_TOO_HARD; mem = 
mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); @@ -508,7 +533,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, * We do not require memory to be preregistered in this case * so lock rmap and do __find_linux_pte_or_hugepte(). */ - if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) return H_TOO_HARD; rmap = (void *) vmalloc_to_phys(rmap); @@ -542,7 +567,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); ua = 0; - if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -557,7 +582,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, } } - kvmppc_tce_put(stt, entry + i, tce); + kvmppc_rm_tce_put(stt, entry + i, tce); } unlock_exit: @@ -583,7 +608,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (!stt) return H_TOO_HARD; - ret = kvmppc_ioba_validate(stt, ioba, npages); + ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0); if (ret != H_SUCCESS) return ret; @@ -610,7 +635,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); + kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); return H_SUCCESS; } @@ -635,6 +660,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, idx = (ioba >> stt->page_shift) - stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; + if (!page) { + vcpu->arch.regs.gpr[4] = 0; + return H_SUCCESS; + } tbl = (u64 *)page_address(page); vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7bdcd4d7a9f0..d5fc624e0655 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -750,7 +750,7 @@ static bool 
kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) /* * Ensure that the read of vcore->dpdes comes after the read * of vcpu->doorbell_request. This barrier matches the - * smb_wmb() in kvmppc_guest_entry_inject(). + * smp_wmb() in kvmppc_guest_entry_inject(). */ smp_rmb(); vc = vcpu->arch.vcore; @@ -802,6 +802,80 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, } } +/* Copy guest memory in place - must reside within a single memslot */ +static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from, + unsigned long len) +{ + struct kvm_memory_slot *to_memslot = NULL; + struct kvm_memory_slot *from_memslot = NULL; + unsigned long to_addr, from_addr; + int r; + + /* Get HPA for from address */ + from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT); + if (!from_memslot) + return -EFAULT; + if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages) + << PAGE_SHIFT)) + return -EINVAL; + from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT); + if (kvm_is_error_hva(from_addr)) + return -EFAULT; + from_addr |= (from & (PAGE_SIZE - 1)); + + /* Get HPA for to address */ + to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT); + if (!to_memslot) + return -EFAULT; + if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages) + << PAGE_SHIFT)) + return -EINVAL; + to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT); + if (kvm_is_error_hva(to_addr)) + return -EFAULT; + to_addr |= (to & (PAGE_SIZE - 1)); + + /* Perform copy */ + r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr, + len); + if (r) + return -EFAULT; + mark_page_dirty(kvm, to >> PAGE_SHIFT); + return 0; +} + +static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long dest, unsigned long src) +{ + u64 pg_sz = SZ_4K; /* 4K page size */ + u64 pg_mask = SZ_4K - 1; + int ret; + + /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */ + if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE | + 
H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED)) + return H_PARAMETER; + + /* dest (and src if copy_page flag set) must be page aligned */ + if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask))) + return H_PARAMETER; + + /* zero and/or copy the page as determined by the flags */ + if (flags & H_COPY_PAGE) { + ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz); + if (ret < 0) + return H_PARAMETER; + } else if (flags & H_ZERO_PAGE) { + ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz); + if (ret < 0) + return H_PARAMETER; + } + + /* We can ignore the remaining flags */ + + return H_SUCCESS; +} + static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) { struct kvmppc_vcore *vcore = target->arch.vcore; @@ -1004,6 +1078,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (nesting_enabled(vcpu->kvm)) ret = kvmhv_copy_tofrom_guest_nested(vcpu); break; + case H_PAGE_INIT: + ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + break; default: return RESUME_HOST; } @@ -1048,6 +1127,7 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd) case H_IPOLL: case H_XIRR_X: #endif + case H_PAGE_INIT: return 1; } @@ -2505,37 +2585,6 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) } } -static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu, - struct kvm_nested_guest *nested) -{ - cpumask_t *need_tlb_flush; - int lpid; - - if (!cpu_has_feature(CPU_FTR_HVMODE)) - return; - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - pcpu &= ~0x3UL; - - if (nested) { - lpid = nested->shadow_lpid; - need_tlb_flush = &nested->need_tlb_flush; - } else { - lpid = kvm->arch.lpid; - need_tlb_flush = &kvm->arch.need_tlb_flush; - } - - mtspr(SPRN_LPID, lpid); - isync(); - smp_mb(); - - if (cpumask_test_cpu(pcpu, need_tlb_flush)) { - radix__local_flush_tlb_lpid_guest(lpid); - /* Clear the bit after the TLB flush */ - cpumask_clear_cpu(pcpu, need_tlb_flush); - } -} - static void 
kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; @@ -3229,19 +3278,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) for (sub = 0; sub < core_info.n_subcores; ++sub) spin_unlock(&core_info.vc[sub]->lock); - if (kvm_is_radix(vc->kvm)) { - /* - * Do we need to flush the process scoped TLB for the LPAR? - * - * On POWER9, individual threads can come in here, but the - * TLB is shared between the 4 threads in a core, hence - * invalidating on one thread invalidates for all. - * Thus we make all 4 threads use the same bit here. - * - * Hash must be flushed in realmode in order to use tlbiel. - */ - kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL); - } + guest_enter_irqoff(); + + srcu_idx = srcu_read_lock(&vc->kvm->srcu); + + this_cpu_disable_ftrace(); /* * Interrupts will be enabled once we get into the guest, @@ -3249,19 +3290,14 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ trace_hardirqs_on(); - guest_enter_irqoff(); - - srcu_idx = srcu_read_lock(&vc->kvm->srcu); - - this_cpu_disable_ftrace(); - trap = __kvmppc_vcore_entry(); + trace_hardirqs_off(); + this_cpu_enable_ftrace(); srcu_read_unlock(&vc->kvm->srcu, srcu_idx); - trace_hardirqs_off(); set_irq_happened(trap); spin_lock(&vc->lock); @@ -3514,6 +3550,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, #ifdef CONFIG_ALTIVEC load_vr_state(&vcpu->arch.vr); #endif + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); mtspr(SPRN_DSCR, vcpu->arch.dscr); mtspr(SPRN_IAMR, vcpu->arch.iamr); @@ -3605,6 +3642,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, #ifdef CONFIG_ALTIVEC store_vr_state(&vcpu->arch.vr); #endif + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); if (cpu_has_feature(CPU_FTR_TM) || cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) @@ -3970,7 +4008,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, unsigned long lpcr) { int trap, r, pcpu; - int srcu_idx; + int srcu_idx, lpid; struct kvmppc_vcore *vc; struct kvm *kvm = 
vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested; @@ -4046,8 +4084,12 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, vc->vcore_state = VCORE_RUNNING; trace_kvmppc_run_core(vc, 0); - if (cpu_has_feature(CPU_FTR_HVMODE)) - kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested); + if (cpu_has_feature(CPU_FTR_HVMODE)) { + lpid = nested ? nested->shadow_lpid : kvm->arch.lpid; + mtspr(SPRN_LPID, lpid); + isync(); + kvmppc_check_need_tlb_flush(kvm, pcpu, nested); + } trace_hardirqs_on(); guest_enter_irqoff(); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index b0cf22477e87..6035d24f1d1d 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -805,3 +805,60 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) vcpu->arch.doorbell_request = 0; } } + +static void flush_guest_tlb(struct kvm *kvm) +{ + unsigned long rb, set; + + rb = PPC_BIT(52); /* IS = 2 */ + if (kvm_is_radix(kvm)) { + /* R=1 PRS=1 RIC=2 */ + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r" (rb), "i" (1), "i" (1), "i" (2), + "r" (0) : "memory"); + for (set = 1; set < kvm->arch.tlb_sets; ++set) { + rb += PPC_BIT(51); /* increment set number */ + /* R=1 PRS=1 RIC=0 */ + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r" (rb), "i" (1), "i" (1), "i" (0), + "r" (0) : "memory"); + } + } else { + for (set = 0; set < kvm->arch.tlb_sets; ++set) { + /* R=0 PRS=0 RIC=0 */ + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r" (rb), "i" (0), "i" (0), "i" (0), + "r" (0) : "memory"); + rb += PPC_BIT(51); /* increment set number */ + } + } + asm volatile("ptesync": : :"memory"); +} + +void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu, + struct kvm_nested_guest *nested) +{ + cpumask_t *need_tlb_flush; + + /* + * On POWER9, individual threads can come in here, but the + * TLB is shared between the 4 threads in a core, hence + * invalidating on one thread invalidates for all. 
+ * Thus we make all 4 threads use the same bit. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pcpu = cpu_first_thread_sibling(pcpu); + + if (nested) + need_tlb_flush = &nested->need_tlb_flush; + else + need_tlb_flush = &kvm->arch.need_tlb_flush; + + if (cpumask_test_cpu(pcpu, need_tlb_flush)) { + flush_guest_tlb(kvm); + + /* Clear the bit after the TLB flush */ + cpumask_clear_cpu(pcpu, need_tlb_flush); + } +} +EXPORT_SYMBOL_GPL(kvmppc_check_need_tlb_flush); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 3b3791ed74a6..8431ad1e8391 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -13,6 +13,7 @@ #include <linux/hugetlb.h> #include <linux/module.h> #include <linux/log2.h> +#include <linux/sizes.h> #include <asm/trace.h> #include <asm/kvm_ppc.h> @@ -867,6 +868,149 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, return ret; } +static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa, + int writing, unsigned long *hpa, + struct kvm_memory_slot **memslot_p) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *memslot; + unsigned long gfn, hva, pa, psize = PAGE_SHIFT; + unsigned int shift; + pte_t *ptep, pte; + + /* Find the memslot for this address */ + gfn = gpa >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return H_PARAMETER; + + /* Translate to host virtual address */ + hva = __gfn_to_hva_memslot(memslot, gfn); + + /* Try to find the host pte for that virtual address */ + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + if (!ptep) + return H_TOO_HARD; + pte = kvmppc_read_update_linux_pte(ptep, writing); + if (!pte_present(pte)) + return H_TOO_HARD; + + /* Convert to a physical address */ + if (shift) + psize = 1UL << shift; + pa = pte_pfn(pte) << PAGE_SHIFT; + pa |= hva & (psize - 1); + pa |= gpa & ~PAGE_MASK; + + if (hpa) + *hpa = pa; + 
if (memslot_p) + *memslot_p = memslot; + + return H_SUCCESS; +} + +static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu, + unsigned long dest) +{ + struct kvm_memory_slot *memslot; + struct kvm *kvm = vcpu->kvm; + unsigned long pa, mmu_seq; + long ret = H_SUCCESS; + int i; + + /* Used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + ret = kvmppc_get_hpa(vcpu, dest, 1, &pa, &memslot); + if (ret != H_SUCCESS) + return ret; + + /* Check if we've been invalidated */ + raw_spin_lock(&kvm->mmu_lock.rlock); + if (mmu_notifier_retry(kvm, mmu_seq)) { + ret = H_TOO_HARD; + goto out_unlock; + } + + /* Zero the page */ + for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES) + dcbz((void *)pa); + kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE); + +out_unlock: + raw_spin_unlock(&kvm->mmu_lock.rlock); + return ret; +} + +static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu, + unsigned long dest, unsigned long src) +{ + unsigned long dest_pa, src_pa, mmu_seq; + struct kvm_memory_slot *dest_memslot; + struct kvm *kvm = vcpu->kvm; + long ret = H_SUCCESS; + + /* Used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + ret = kvmppc_get_hpa(vcpu, dest, 1, &dest_pa, &dest_memslot); + if (ret != H_SUCCESS) + return ret; + ret = kvmppc_get_hpa(vcpu, src, 0, &src_pa, NULL); + if (ret != H_SUCCESS) + return ret; + + /* Check if we've been invalidated */ + raw_spin_lock(&kvm->mmu_lock.rlock); + if (mmu_notifier_retry(kvm, mmu_seq)) { + ret = H_TOO_HARD; + goto out_unlock; + } + + /* Copy the page */ + memcpy((void *)dest_pa, (void *)src_pa, SZ_4K); + + kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE); + +out_unlock: + raw_spin_unlock(&kvm->mmu_lock.rlock); + return ret; +} + +long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long dest, unsigned long src) +{ + struct kvm *kvm = vcpu->kvm; + 
u64 pg_mask = SZ_4K - 1; /* 4K page size */ + long ret = H_SUCCESS; + + /* Don't handle radix mode here, go up to the virtual mode handler */ + if (kvm_is_radix(kvm)) + return H_TOO_HARD; + + /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */ + if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE | + H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED)) + return H_PARAMETER; + + /* dest (and src if copy_page flag set) must be page aligned */ + if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask))) + return H_PARAMETER; + + /* zero and/or copy the page as determined by the flags */ + if (flags & H_COPY_PAGE) + ret = kvmppc_do_h_page_init_copy(vcpu, dest, src); + else if (flags & H_ZERO_PAGE) + ret = kvmppc_do_h_page_init_zero(vcpu, dest); + + /* We can ignore the other flags */ + + return ret; +} + void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index) { diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index dd014308f065..f9b2620fbecd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -589,11 +589,8 @@ kvmppc_hv_entry: 1: #endif - /* Use cr7 as an indication of radix mode */ ld r5, HSTATE_KVM_VCORE(r13) ld r9, VCORE_KVM(r5) /* pointer to struct kvm */ - lbz r0, KVM_RADIX(r9) - cmpwi cr7, r0, 0 /* * POWER7/POWER8 host -> guest partition switch code. @@ -616,9 +613,6 @@ kvmppc_hv_entry: cmpwi r6,0 bne 10f - /* Radix has already switched LPID and flushed core TLB */ - bne cr7, 22f - lwz r7,KVM_LPID(r9) BEGIN_FTR_SECTION ld r6,KVM_SDR1(r9) @@ -630,41 +624,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_LPID,r7 isync - /* See if we need to flush the TLB. 
Hash has to be done in RM */ - lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ -BEGIN_FTR_SECTION - /* - * On POWER9, individual threads can come in here, but the - * TLB is shared between the 4 threads in a core, hence - * invalidating on one thread invalidates for all. - * Thus we make all 4 threads use the same bit here. - */ - clrrdi r6,r6,2 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - clrldi r7,r6,64-6 /* extract bit number (6 bits) */ - srdi r6,r6,6 /* doubleword number */ - sldi r6,r6,3 /* address offset */ - add r6,r6,r9 - addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ - li r8,1 - sld r8,r8,r7 - ld r7,0(r6) - and. r7,r7,r8 - beq 22f - /* Flush the TLB of any entries for this LPID */ - lwz r0,KVM_TLB_SETS(r9) - mtctr r0 - li r7,0x800 /* IS field = 0b10 */ - ptesync - li r0,0 /* RS for P9 version of tlbiel */ -28: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */ - addi r7,r7,0x1000 - bdnz 28b - ptesync -23: ldarx r7,0,r6 /* clear the bit after TLB flushed */ - andc r7,r7,r8 - stdcx. r7,0,r6 - bne 23b + /* See if we need to flush the TLB. */ + mr r3, r9 /* kvm pointer */ + lhz r4, PACAPACAINDEX(r13) /* physical cpu number */ + li r5, 0 /* nested vcpu pointer */ + bl kvmppc_check_need_tlb_flush + nop + ld r5, HSTATE_KVM_VCORE(r13) /* Add timebase offset onto timebase */ 22: ld r8,VCORE_TB_OFFSET(r5) @@ -980,17 +946,27 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) #ifdef CONFIG_KVM_XICS /* We are entering the guest on that thread, push VCPU to XIVE */ - ld r10, HSTATE_XIVE_TIMA_PHYS(r13) - cmpldi cr0, r10, 0 - beq no_xive ld r11, VCPU_XIVE_SAVED_STATE(r4) li r9, TM_QW1_OS + lwz r8, VCPU_XIVE_CAM_WORD(r4) + li r7, TM_QW1_OS + TM_WORD2 + mfmsr r0 + andi. r0, r0, MSR_DR /* in real mode? 
*/ + beq 2f + ld r10, HSTATE_XIVE_TIMA_VIRT(r13) + cmpldi cr1, r10, 0 + beq cr1, no_xive + eieio + stdx r11,r9,r10 + stwx r8,r7,r10 + b 3f +2: ld r10, HSTATE_XIVE_TIMA_PHYS(r13) + cmpldi cr1, r10, 0 + beq cr1, no_xive eieio stdcix r11,r9,r10 - lwz r11, VCPU_XIVE_CAM_WORD(r4) - li r9, TM_QW1_OS + TM_WORD2 - stwcix r11,r9,r10 - li r9, 1 + stwcix r8,r7,r10 +3: li r9, 1 stb r9, VCPU_XIVE_PUSHED(r4) eieio @@ -1009,12 +985,16 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) * on, we mask it. */ lbz r0, VCPU_XIVE_ESC_ON(r4) - cmpwi r0,0 - beq 1f - ld r10, VCPU_XIVE_ESC_RADDR(r4) + cmpwi cr1, r0,0 + beq cr1, 1f li r9, XIVE_ESB_SET_PQ_01 + beq 4f /* in real mode? */ + ld r10, VCPU_XIVE_ESC_VADDR(r4) + ldx r0, r10, r9 + b 5f +4: ld r10, VCPU_XIVE_ESC_RADDR(r4) ldcix r0, r10, r9 - sync +5: sync /* We have a possible subtle race here: The escalation interrupt might * have fired and be on its way to the host queue while we mask it, @@ -2292,7 +2272,7 @@ hcall_real_table: #endif .long 0 /* 0x24 - H_SET_SPRG0 */ .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table - .long 0 /* 0x2c */ + .long DOTSYM(kvmppc_rm_h_page_init) - hcall_real_table .long 0 /* 0x30 */ .long 0 /* 0x34 */ .long 0 /* 0x38 */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index f78d002f0fe0..4953957333b7 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -166,7 +166,8 @@ static irqreturn_t xive_esc_irq(int irq, void *data) return IRQ_HANDLED; } -static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) +int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, + bool single_escalation) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; struct xive_q *q = &xc->queues[prio]; @@ -185,7 +186,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) return -EIO; } - if (xc->xive->single_escalation) + if (single_escalation) name = kasprintf(GFP_KERNEL, "kvm-%d-%d", vcpu->kvm->arch.lpid, xc->server_num); else @@ -217,7 +218,7 
@@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) * interrupt, thus leaving it effectively masked after * it fires once. */ - if (xc->xive->single_escalation) { + if (single_escalation) { struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -291,7 +292,8 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) continue; rc = xive_provision_queue(vcpu, prio); if (rc == 0 && !xive->single_escalation) - xive_attach_escalation(vcpu, prio); + kvmppc_xive_attach_escalation(vcpu, prio, + xive->single_escalation); if (rc) return rc; } @@ -342,7 +344,7 @@ static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio) return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY; } -static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) +int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio) { struct kvm_vcpu *vcpu; int i, rc; @@ -380,11 +382,6 @@ static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) return -EBUSY; } -static u32 xive_vp(struct kvmppc_xive *xive, u32 server) -{ - return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server); -} - static u8 xive_lock_and_mask(struct kvmppc_xive *xive, struct kvmppc_xive_src_block *sb, struct kvmppc_xive_irq_state *state) @@ -430,8 +427,8 @@ static u8 xive_lock_and_mask(struct kvmppc_xive *xive, */ if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { xive_native_configure_irq(hw_num, - xive_vp(xive, state->act_server), - MASKED, state->number); + kvmppc_xive_vp(xive, state->act_server), + MASKED, state->number); /* set old_p so we can track if an H_EOI was done */ state->old_p = true; state->old_q = false; @@ -486,8 +483,8 @@ static void xive_finish_unmask(struct kvmppc_xive *xive, */ if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { xive_native_configure_irq(hw_num, - xive_vp(xive, state->act_server), - state->act_priority, state->number); + kvmppc_xive_vp(xive, state->act_server), + state->act_priority, 
state->number); /* If an EOI is needed, do it here */ if (!state->old_p) xive_vm_source_eoi(hw_num, xd); @@ -535,7 +532,7 @@ static int xive_target_interrupt(struct kvm *kvm, * priority. The count for that new target will have * already been incremented. */ - rc = xive_select_target(kvm, &server, prio); + rc = kvmppc_xive_select_target(kvm, &server, prio); /* * We failed to find a target ? Not much we can do @@ -563,7 +560,7 @@ static int xive_target_interrupt(struct kvm *kvm, kvmppc_xive_select_irq(state, &hw_num, NULL); return xive_native_configure_irq(hw_num, - xive_vp(xive, server), + kvmppc_xive_vp(xive, server), prio, state->number); } @@ -849,7 +846,8 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) /* * We can't update the state of a "pushed" VCPU, but that - * shouldn't happen. + * shouldn't happen because the vcpu->mutex makes running a + * vcpu mutually exclusive with doing one_reg get/set on it. */ if (WARN_ON(vcpu->arch.xive_pushed)) return -EIO; @@ -940,6 +938,13 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, /* Turn the IPI hard off */ xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); + /* + * Reset ESB guest mapping. Needed when ESB pages are exposed + * to the guest in XIVE native mode + */ + if (xive->ops && xive->ops->reset_mapped) + xive->ops->reset_mapped(kvm, guest_irq); + /* Grab info about irq */ state->pt_number = hw_irq; state->pt_data = irq_data_get_irq_handler_data(host_data); @@ -951,7 +956,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, * which is fine for a never started interrupt. */ xive_native_configure_irq(hw_irq, - xive_vp(xive, state->act_server), + kvmppc_xive_vp(xive, state->act_server), state->act_priority, state->number); /* @@ -1025,9 +1030,17 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, state->pt_number = 0; state->pt_data = NULL; + /* + * Reset ESB guest mapping. 
Needed when ESB pages are exposed + * to the guest in XIVE native mode + */ + if (xive->ops && xive->ops->reset_mapped) { + xive->ops->reset_mapped(kvm, guest_irq); + } + /* Reconfigure the IPI */ xive_native_configure_irq(state->ipi_number, - xive_vp(xive, state->act_server), + kvmppc_xive_vp(xive, state->act_server), state->act_priority, state->number); /* @@ -1049,7 +1062,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, } EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped); -static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) +void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; struct kvm *kvm = vcpu->kvm; @@ -1083,14 +1096,35 @@ static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) arch_spin_unlock(&sb->lock); } } + + /* Disable vcpu's escalation interrupt */ + if (vcpu->arch.xive_esc_on) { + __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr + + XIVE_ESB_SET_PQ_01)); + vcpu->arch.xive_esc_on = false; + } + + /* + * Clear pointers to escalation interrupt ESB. + * This is safe because the vcpu->mutex is held, preventing + * any other CPU from concurrently executing a KVM_RUN ioctl. 
+ */ + vcpu->arch.xive_esc_vaddr = 0; + vcpu->arch.xive_esc_raddr = 0; } void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - struct kvmppc_xive *xive = xc->xive; + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; int i; + if (!kvmppc_xics_enabled(vcpu)) + return; + + if (!xc) + return; + pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); /* Ensure no interrupt is still routed to that VP */ @@ -1129,6 +1163,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) } /* Free the VP */ kfree(xc); + + /* Cleanup the vcpu */ + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; + vcpu->arch.xive_vcpu = NULL; } int kvmppc_xive_connect_vcpu(struct kvm_device *dev, @@ -1146,7 +1184,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, } if (xive->kvm != vcpu->kvm) return -EPERM; - if (vcpu->arch.irq_type) + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { pr_devel("Duplicate !\n"); @@ -1166,7 +1204,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, xc->xive = xive; xc->vcpu = vcpu; xc->server_num = cpu; - xc->vp_id = xive_vp(xive, cpu); + xc->vp_id = kvmppc_xive_vp(xive, cpu); xc->mfrr = 0xff; xc->valid = true; @@ -1219,7 +1257,8 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (xive->qmap & (1 << i)) { r = xive_provision_queue(vcpu, i); if (r == 0 && !xive->single_escalation) - xive_attach_escalation(vcpu, i); + kvmppc_xive_attach_escalation( + vcpu, i, xive->single_escalation); if (r) goto bail; } else { @@ -1234,7 +1273,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, } /* If not done above, attach priority 0 escalation */ - r = xive_attach_escalation(vcpu, 0); + r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation); if (r) goto bail; @@ -1485,8 +1524,8 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) return 0; } -static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive 
*xive, - int irq) +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block( + struct kvmppc_xive *xive, int irq) { struct kvm *kvm = xive->kvm; struct kvmppc_xive_src_block *sb; @@ -1509,6 +1548,7 @@ static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *x for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i; + sb->irq_state[i].eisn = 0; sb->irq_state[i].guest_priority = MASKED; sb->irq_state[i].saved_priority = MASKED; sb->irq_state[i].act_priority = MASKED; @@ -1565,7 +1605,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) sb = kvmppc_xive_find_source(xive, irq, &idx); if (!sb) { pr_devel("No source, creating source block...\n"); - sb = xive_create_src_block(xive, irq); + sb = kvmppc_xive_create_src_block(xive, irq); if (!sb) { pr_devel("Failed to create block...\n"); return -ENOMEM; @@ -1789,7 +1829,7 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd) xive_cleanup_irq_data(xd); } -static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) { int i; @@ -1810,16 +1850,55 @@ static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) } } -static void kvmppc_xive_free(struct kvm_device *dev) +/* + * Called when device fd is closed. kvm->lock is held. + */ +static void kvmppc_xive_release(struct kvm_device *dev) { struct kvmppc_xive *xive = dev->private; struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; int i; + int was_ready; + + pr_devel("Releasing xive device\n"); debugfs_remove(xive->dentry); - if (kvm) - kvm->arch.xive = NULL; + /* + * Clearing mmu_ready temporarily while holding kvm->lock + * is a way of ensuring that no vcpus can enter the guest + * until we drop kvm->lock. Doing kick_all_cpus_sync() + * ensures that any vcpu executing inside the guest has + * exited the guest. 
Once kick_all_cpus_sync() has finished, + * we know that no vcpu can be executing the XIVE push or + * pull code, or executing a XICS hcall. + * + * Since this is the device release function, we know that + * userspace does not have any open fd referring to the + * device. Therefore there can not be any of the device + * attribute set/get functions being executed concurrently, + * and similarly, the connect_vcpu and set/clr_mapped + * functions also cannot be being executed. + */ + was_ready = kvm->arch.mmu_ready; + kvm->arch.mmu_ready = 0; + kick_all_cpus_sync(); + + /* + * We should clean up the vCPU interrupt presenters first. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + /* + * Take vcpu->mutex to ensure that no one_reg get/set ioctl + * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently. + */ + mutex_lock(&vcpu->mutex); + kvmppc_xive_cleanup_vcpu(vcpu); + mutex_unlock(&vcpu->mutex); + } + + kvm->arch.xive = NULL; /* Mask and free interrupts */ for (i = 0; i <= xive->max_sbid; i++) { @@ -1832,11 +1911,47 @@ static void kvmppc_xive_free(struct kvm_device *dev) if (xive->vp_base != XIVE_INVALID_VP) xive_native_free_vp_block(xive->vp_base); + kvm->arch.mmu_ready = was_ready; + + /* + * A reference of the kvmppc_xive pointer is now kept under + * the xive_devices struct of the machine for reuse. It is + * freed when the VM is destroyed for now until we fix all the + * execution paths. + */ - kfree(xive); kfree(dev); } +/* + * When the guest chooses the interrupt mode (XICS legacy or XIVE + * native), the VM will switch of KVM device. The previous device will + * be "released" before the new one is created. + * + * Until we are sure all execution paths are well protected, provide a + * fail safe (transitional) method for device destruction, in which + * the XIVE device pointer is recycled and not directly freed. + */ +struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type) +{ + struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ? 
+ &kvm->arch.xive_devices.native : + &kvm->arch.xive_devices.xics_on_xive; + struct kvmppc_xive *xive = *kvm_xive_device; + + if (!xive) { + xive = kzalloc(sizeof(*xive), GFP_KERNEL); + *kvm_xive_device = xive; + } else { + memset(xive, 0, sizeof(*xive)); + } + + return xive; +} + +/* + * Create a XICS device with XIVE backend. kvm->lock is held. + */ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) { struct kvmppc_xive *xive; @@ -1845,7 +1960,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) pr_devel("Creating xive for partition\n"); - xive = kzalloc(sizeof(*xive), GFP_KERNEL); + xive = kvmppc_xive_get_device(kvm, type); if (!xive) return -ENOMEM; @@ -1883,6 +1998,43 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) return 0; } +int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + unsigned int i; + + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + u32 i0, i1, idx; + + if (!q->qpage && !xc->esc_virq[i]) + continue; + + seq_printf(m, " [q%d]: ", i); + + if (q->qpage) { + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + seq_printf(m, "T=%d %08x %08x...\n", q->toggle, + i0, i1); + } + if (xc->esc_virq[i]) { + struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]); + struct xive_irq_data *xd = + irq_data_get_irq_handler_data(d); + u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET); + + seq_printf(m, "E:%c%c I(%d:%llx:%llx)", + (pq & XIVE_ESB_VAL_P) ? 'P' : 'p', + (pq & XIVE_ESB_VAL_Q) ? 
'Q' : 'q', + xc->esc_virq[i], pq, xd->eoi_page); + seq_puts(m, "\n"); + } + } + return 0; +} static int xive_debug_show(struct seq_file *m, void *private) { @@ -1908,7 +2060,6 @@ static int xive_debug_show(struct seq_file *m, void *private) kvm_for_each_vcpu(i, vcpu, kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - unsigned int i; if (!xc) continue; @@ -1918,33 +2069,8 @@ static int xive_debug_show(struct seq_file *m, void *private) xc->server_num, xc->cppr, xc->hw_cppr, xc->mfrr, xc->pending, xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); - for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { - struct xive_q *q = &xc->queues[i]; - u32 i0, i1, idx; - - if (!q->qpage && !xc->esc_virq[i]) - continue; - seq_printf(m, " [q%d]: ", i); - - if (q->qpage) { - idx = q->idx; - i0 = be32_to_cpup(q->qpage + idx); - idx = (idx + 1) & q->msk; - i1 = be32_to_cpup(q->qpage + idx); - seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1); - } - if (xc->esc_virq[i]) { - struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]); - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); - u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET); - seq_printf(m, "E:%c%c I(%d:%llx:%llx)", - (pq & XIVE_ESB_VAL_P) ? 'P' : 'p', - (pq & XIVE_ESB_VAL_Q) ? 
'Q' : 'q', - xc->esc_virq[i], pq, xd->eoi_page); - seq_printf(m, "\n"); - } - } + kvmppc_xive_debug_show_queues(m, vcpu); t_rm_h_xirr += xc->stat_rm_h_xirr; t_rm_h_ipoll += xc->stat_rm_h_ipoll; @@ -1999,7 +2125,7 @@ struct kvm_device_ops kvm_xive_ops = { .name = "kvm-xive", .create = kvmppc_xive_create, .init = kvmppc_xive_init, - .destroy = kvmppc_xive_free, + .release = kvmppc_xive_release, .set_attr = xive_set_attr, .get_attr = xive_get_attr, .has_attr = xive_has_attr, diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index a08ae6fd4c51..426146332984 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -13,6 +13,13 @@ #include "book3s_xics.h" /* + * The XIVE Interrupt source numbers are within the range 0 to + * KVMPPC_XICS_NR_IRQS. + */ +#define KVMPPC_XIVE_FIRST_IRQ 0 +#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS + +/* * State for one guest irq source. * * For each guest source we allocate a HW interrupt in the XIVE @@ -54,6 +61,9 @@ struct kvmppc_xive_irq_state { bool saved_p; bool saved_q; u8 saved_scan_prio; + + /* Xive native */ + u32 eisn; /* Guest Effective IRQ number */ }; /* Select the "right" interrupt (IPI vs. 
passthrough) */ @@ -84,6 +94,11 @@ struct kvmppc_xive_src_block { struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; }; +struct kvmppc_xive; + +struct kvmppc_xive_ops { + int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq); +}; struct kvmppc_xive { struct kvm *kvm; @@ -122,6 +137,10 @@ struct kvmppc_xive { /* Flags */ u8 single_escalation; + + struct kvmppc_xive_ops *ops; + struct address_space *mapping; + struct mutex mapping_lock; }; #define KVMPPC_XIVE_Q_COUNT 8 @@ -198,6 +217,11 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp return xive->src_blocks[bid]; } +static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server) +{ + return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server); +} + /* * Mapping between guest priorities and host priorities * is as follow. @@ -248,5 +272,18 @@ extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); +/* + * Common Xive routines for XICS-over-XIVE and XIVE native + */ +void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu); +int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu); +struct kvmppc_xive_src_block *kvmppc_xive_create_src_block( + struct kvmppc_xive *xive, int irq); +void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb); +int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); +int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, + bool single_escalation); +struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); + #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c new file mode 100644 index 000000000000..6a8e698c4b6e --- /dev/null +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -0,0 +1,1249 @@ 
+// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2017-2019, IBM Corporation. + */ + +#define pr_fmt(fmt) "xive-kvm: " fmt + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/file.h> +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/debug.h> +#include <asm/debugfs.h> +#include <asm/opal.h> + +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "book3s_xive.h" + +static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) + offset |= offset << 4; + + val = in_be64(xd->eoi_mmio + offset); + return (u8)val; +} + +static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_q *q = &xc->queues[prio]; + + xive_native_disable_queue(xc->vp_id, q, prio); + if (q->qpage) { + put_page(virt_to_page(q->qpage)); + q->qpage = NULL; + } +} + +void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + int i; + + if (!kvmppc_xive_enabled(vcpu)) + return; + + if (!xc) + return; + + pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num); + + /* Ensure no interrupt is still routed to that VP */ + xc->valid = false; + kvmppc_xive_disable_vcpu_interrupts(vcpu); + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Free the queues & associated interrupts */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + /* Free the escalation irq */ + if (xc->esc_virq[i]) { + free_irq(xc->esc_virq[i], vcpu); + irq_dispose_mapping(xc->esc_virq[i]); + kfree(xc->esc_virq_names[i]); + xc->esc_virq[i] = 0; + } + + /* Free the queue */ + kvmppc_xive_native_cleanup_queue(vcpu, i); + } + + /* Free the VP */ + 
kfree(xc); + + /* Cleanup the vcpu */ + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; + vcpu->arch.xive_vcpu = NULL; +} + +int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 server_num) +{ + struct kvmppc_xive *xive = dev->private; + struct kvmppc_xive_vcpu *xc = NULL; + int rc; + + pr_devel("native_connect_vcpu(server=%d)\n", server_num); + + if (dev->ops != &kvm_xive_native_ops) { + pr_devel("Wrong ops !\n"); + return -EPERM; + } + if (xive->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) + return -EBUSY; + if (server_num >= KVM_MAX_VCPUS) { + pr_devel("Out of bounds !\n"); + return -EINVAL; + } + + mutex_lock(&vcpu->kvm->lock); + + if (kvmppc_xive_find_server(vcpu->kvm, server_num)) { + pr_devel("Duplicate !\n"); + rc = -EEXIST; + goto bail; + } + + xc = kzalloc(sizeof(*xc), GFP_KERNEL); + if (!xc) { + rc = -ENOMEM; + goto bail; + } + + vcpu->arch.xive_vcpu = xc; + xc->xive = xive; + xc->vcpu = vcpu; + xc->server_num = server_num; + + xc->vp_id = kvmppc_xive_vp(xive, server_num); + xc->valid = true; + vcpu->arch.irq_type = KVMPPC_IRQ_XIVE; + + rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id); + if (rc) { + pr_err("Failed to get VP info from OPAL: %d\n", rc); + goto bail; + } + + /* + * Enable the VP first as the single escalation mode will + * affect escalation interrupts numbering + */ + rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + if (rc) { + pr_err("Failed to enable VP in OPAL: %d\n", rc); + goto bail; + } + + /* Configure VCPU fields for use by assembly push/pull */ + vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); + vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); + + /* TODO: reset all queues to a clean state ? 
*/ +bail: + mutex_unlock(&vcpu->kvm->lock); + if (rc) + kvmppc_xive_native_cleanup_vcpu(vcpu); + + return rc; +} + +/* + * Device passthrough support + */ +static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + + if (irq >= KVMPPC_XIVE_NR_IRQS) + return -EINVAL; + + /* + * Clear the ESB pages of the IRQ number being mapped (or + * unmapped) into the guest and let the the VM fault handler + * repopulate with the appropriate ESB pages (device or IC) + */ + pr_debug("clearing esb pages for girq 0x%lx\n", irq); + mutex_lock(&xive->mapping_lock); + if (xive->mapping) + unmap_mapping_range(xive->mapping, + irq * (2ull << PAGE_SHIFT), + 2ull << PAGE_SHIFT, 1); + mutex_unlock(&xive->mapping_lock); + return 0; +} + +static struct kvmppc_xive_ops kvmppc_xive_native_ops = { + .reset_mapped = kvmppc_xive_native_reset_mapped, +}; + +static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct kvm_device *dev = vma->vm_file->private_data; + struct kvmppc_xive *xive = dev->private; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct xive_irq_data *xd; + u32 hw_num; + u16 src; + u64 page; + unsigned long irq; + u64 page_offset; + + /* + * Linux/KVM uses a two pages ESB setting, one for trigger and + * one for EOI + */ + page_offset = vmf->pgoff - vma->vm_pgoff; + irq = page_offset / 2; + + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) { + pr_devel("%s: source %lx not found !\n", __func__, irq); + return VM_FAULT_SIGBUS; + } + + state = &sb->irq_state[src]; + kvmppc_xive_select_irq(state, &hw_num, &xd); + + arch_spin_lock(&sb->lock); + + /* + * first/even page is for trigger + * second/odd page is for EOI and management. + */ + page = page_offset % 2 ? 
xd->eoi_page : xd->trig_page; + arch_spin_unlock(&sb->lock); + + if (WARN_ON(!page)) { + pr_err("%s: accessing invalid ESB page for source %lx !\n", + __func__, irq); + return VM_FAULT_SIGBUS; + } + + vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT); + return VM_FAULT_NOPAGE; +} + +static const struct vm_operations_struct xive_native_esb_vmops = { + .fault = xive_native_esb_fault, +}; + +static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + switch (vmf->pgoff - vma->vm_pgoff) { + case 0: /* HW - forbid access */ + case 1: /* HV - forbid access */ + return VM_FAULT_SIGBUS; + case 2: /* OS */ + vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT); + return VM_FAULT_NOPAGE; + case 3: /* USER - TODO */ + default: + return VM_FAULT_SIGBUS; + } +} + +static const struct vm_operations_struct xive_native_tima_vmops = { + .fault = xive_native_tima_fault, +}; + +static int kvmppc_xive_native_mmap(struct kvm_device *dev, + struct vm_area_struct *vma) +{ + struct kvmppc_xive *xive = dev->private; + + /* We only allow mappings at fixed offset for now */ + if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) { + if (vma_pages(vma) > 4) + return -EINVAL; + vma->vm_ops = &xive_native_tima_vmops; + } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) { + if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2) + return -EINVAL; + vma->vm_ops = &xive_native_esb_vmops; + } else { + return -EINVAL; + } + + vma->vm_flags |= VM_IO | VM_PFNMAP; + vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); + + /* + * Grab the KVM device file address_space to be able to clear + * the ESB pages mapping when a device is passed-through into + * the guest. 
+ */ + xive->mapping = vma->vm_file->f_mapping; + return 0; +} + +static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq, + u64 addr) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u64 __user *ubufp = (u64 __user *) addr; + u64 val; + u16 idx; + int rc; + + pr_devel("%s irq=0x%lx\n", __func__, irq); + + if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS) + return -E2BIG; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) { + pr_debug("No source, creating source block...\n"); + sb = kvmppc_xive_create_src_block(xive, irq); + if (!sb) { + pr_err("Failed to create block...\n"); + return -ENOMEM; + } + } + state = &sb->irq_state[idx]; + + if (get_user(val, ubufp)) { + pr_err("fault getting user info !\n"); + return -EFAULT; + } + + arch_spin_lock(&sb->lock); + + /* + * If the source doesn't already have an IPI, allocate + * one and get the corresponding data + */ + if (!state->ipi_number) { + state->ipi_number = xive_native_alloc_irq(); + if (state->ipi_number == 0) { + pr_err("Failed to allocate IRQ !\n"); + rc = -ENXIO; + goto unlock; + } + xive_native_populate_irq_data(state->ipi_number, + &state->ipi_data); + pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__, + state->ipi_number, irq); + } + + /* Restore LSI state */ + if (val & KVM_XIVE_LEVEL_SENSITIVE) { + state->lsi = true; + if (val & KVM_XIVE_LEVEL_ASSERTED) + state->asserted = true; + pr_devel(" LSI ! 
Asserted=%d\n", state->asserted); + } + + /* Mask IRQ to start with */ + state->act_server = 0; + state->act_priority = MASKED; + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(state->ipi_number, 0, MASKED, 0); + + /* Increment the number of valid sources and mark this one valid */ + if (!state->valid) + xive->src_count++; + state->valid = true; + + rc = 0; + +unlock: + arch_spin_unlock(&sb->lock); + + return rc; +} + +static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + struct kvmppc_xive_irq_state *state, + u32 server, u8 priority, bool masked, + u32 eisn) +{ + struct kvm *kvm = xive->kvm; + u32 hw_num; + int rc = 0; + + arch_spin_lock(&sb->lock); + + if (state->act_server == server && state->act_priority == priority && + state->eisn == eisn) + goto unlock; + + pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n", + priority, server, masked, state->act_server, + state->act_priority); + + kvmppc_xive_select_irq(state, &hw_num, NULL); + + if (priority != MASKED && !masked) { + rc = kvmppc_xive_select_target(kvm, &server, priority); + if (rc) + goto unlock; + + state->act_priority = priority; + state->act_server = server; + state->eisn = eisn; + + rc = xive_native_configure_irq(hw_num, + kvmppc_xive_vp(xive, server), + priority, eisn); + } else { + state->act_priority = MASKED; + state->act_server = 0; + state->eisn = 0; + + rc = xive_native_configure_irq(hw_num, 0, MASKED, 0); + } + +unlock: + arch_spin_unlock(&sb->lock); + return rc; +} + +static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive, + long irq, u64 addr) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u64 __user *ubufp = (u64 __user *) addr; + u16 src; + u64 kvm_cfg; + u32 server; + u8 priority; + bool masked; + u32 eisn; + + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) + return -ENOENT; + + state = 
&sb->irq_state[src]; + + if (!state->valid) + return -EINVAL; + + if (get_user(kvm_cfg, ubufp)) + return -EFAULT; + + pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg); + + priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >> + KVM_XIVE_SOURCE_PRIORITY_SHIFT; + server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >> + KVM_XIVE_SOURCE_SERVER_SHIFT; + masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >> + KVM_XIVE_SOURCE_MASKED_SHIFT; + eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >> + KVM_XIVE_SOURCE_EISN_SHIFT; + + if (priority != xive_prio_from_guest(priority)) { + pr_err("invalid priority for queue %d for VCPU %d\n", + priority, server); + return -EINVAL; + } + + return kvmppc_xive_native_update_source_config(xive, sb, state, server, + priority, masked, eisn); +} + +static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive, + long irq, u64 addr) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct xive_irq_data *xd; + u32 hw_num; + u16 src; + int rc = 0; + + pr_devel("%s irq=0x%lx", __func__, irq); + + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) + return -ENOENT; + + state = &sb->irq_state[src]; + + rc = -EINVAL; + + arch_spin_lock(&sb->lock); + + if (state->valid) { + kvmppc_xive_select_irq(state, &hw_num, &xd); + xive_native_sync_source(hw_num); + rc = 0; + } + + arch_spin_unlock(&sb->lock); + return rc; +} + +static int xive_native_validate_queue_size(u32 qshift) +{ + /* + * We only support 64K pages for the moment. 
This is also + * advertised in the DT property "ibm,xive-eq-sizes" + */ + switch (qshift) { + case 0: /* EQ reset */ + case 16: + return 0; + case 12: + case 21: + case 24: + default: + return -EINVAL; + } +} + +static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, + long eq_idx, u64 addr) +{ + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + struct kvmppc_xive_vcpu *xc; + void __user *ubufp = (void __user *) addr; + u32 server; + u8 priority; + struct kvm_ppc_xive_eq kvm_eq; + int rc; + __be32 *qaddr = 0; + struct page *page; + struct xive_q *q; + gfn_t gfn; + unsigned long page_size; + + /* + * Demangle priority/server tuple from the EQ identifier + */ + priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >> + KVM_XIVE_EQ_PRIORITY_SHIFT; + server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >> + KVM_XIVE_EQ_SERVER_SHIFT; + + if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq))) + return -EFAULT; + + vcpu = kvmppc_xive_find_server(kvm, server); + if (!vcpu) { + pr_err("Can't find server %d\n", server); + return -ENOENT; + } + xc = vcpu->arch.xive_vcpu; + + if (priority != xive_prio_from_guest(priority)) { + pr_err("Trying to restore invalid queue %d for VCPU %d\n", + priority, server); + return -EINVAL; + } + q = &xc->queues[priority]; + + pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n", + __func__, server, priority, kvm_eq.flags, + kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex); + + /* + * sPAPR specifies a "Unconditional Notify (n) flag" for the + * H_INT_SET_QUEUE_CONFIG hcall which forces notification + * without using the coalescing mechanisms provided by the + * XIVE END ESBs. This is required on KVM as notification + * using the END ESBs is not supported. 
+ */ + if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) { + pr_err("invalid flags %d\n", kvm_eq.flags); + return -EINVAL; + } + + rc = xive_native_validate_queue_size(kvm_eq.qshift); + if (rc) { + pr_err("invalid queue size %d\n", kvm_eq.qshift); + return rc; + } + + /* reset queue and disable queueing */ + if (!kvm_eq.qshift) { + q->guest_qaddr = 0; + q->guest_qshift = 0; + + rc = xive_native_configure_queue(xc->vp_id, q, priority, + NULL, 0, true); + if (rc) { + pr_err("Failed to reset queue %d for VCPU %d: %d\n", + priority, xc->server_num, rc); + return rc; + } + + if (q->qpage) { + put_page(virt_to_page(q->qpage)); + q->qpage = NULL; + } + + return 0; + } + + if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) { + pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr, + 1ull << kvm_eq.qshift); + return -EINVAL; + } + + gfn = gpa_to_gfn(kvm_eq.qaddr); + page = gfn_to_page(kvm, gfn); + if (is_error_page(page)) { + pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr); + return -EINVAL; + } + + page_size = kvm_host_page_size(kvm, gfn); + if (1ull << kvm_eq.qshift > page_size) { + pr_warn("Incompatible host page size %lx!\n", page_size); + return -EINVAL; + } + + qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK); + + /* + * Backup the queue page guest address to the mark EQ page + * dirty for migration. + */ + q->guest_qaddr = kvm_eq.qaddr; + q->guest_qshift = kvm_eq.qshift; + + /* + * Unconditional Notification is forced by default at the + * OPAL level because the use of END ESBs is not supported by + * Linux. + */ + rc = xive_native_configure_queue(xc->vp_id, q, priority, + (__be32 *) qaddr, kvm_eq.qshift, true); + if (rc) { + pr_err("Failed to configure queue %d for VCPU %d: %d\n", + priority, xc->server_num, rc); + put_page(page); + return rc; + } + + /* + * Only restore the queue state when needed. When doing the + * H_INT_SET_SOURCE_CONFIG hcall, it should not. 
+ */ + if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) { + rc = xive_native_set_queue_state(xc->vp_id, priority, + kvm_eq.qtoggle, + kvm_eq.qindex); + if (rc) + goto error; + } + + rc = kvmppc_xive_attach_escalation(vcpu, priority, + xive->single_escalation); +error: + if (rc) + kvmppc_xive_native_cleanup_queue(vcpu, priority); + return rc; +} + +static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive, + long eq_idx, u64 addr) +{ + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + struct kvmppc_xive_vcpu *xc; + struct xive_q *q; + void __user *ubufp = (u64 __user *) addr; + u32 server; + u8 priority; + struct kvm_ppc_xive_eq kvm_eq; + u64 qaddr; + u64 qshift; + u64 qeoi_page; + u32 escalate_irq; + u64 qflags; + int rc; + + /* + * Demangle priority/server tuple from the EQ identifier + */ + priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >> + KVM_XIVE_EQ_PRIORITY_SHIFT; + server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >> + KVM_XIVE_EQ_SERVER_SHIFT; + + vcpu = kvmppc_xive_find_server(kvm, server); + if (!vcpu) { + pr_err("Can't find server %d\n", server); + return -ENOENT; + } + xc = vcpu->arch.xive_vcpu; + + if (priority != xive_prio_from_guest(priority)) { + pr_err("invalid priority for queue %d for VCPU %d\n", + priority, server); + return -EINVAL; + } + q = &xc->queues[priority]; + + memset(&kvm_eq, 0, sizeof(kvm_eq)); + + if (!q->qpage) + return 0; + + rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift, + &qeoi_page, &escalate_irq, &qflags); + if (rc) + return rc; + + kvm_eq.flags = 0; + if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY) + kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY; + + kvm_eq.qshift = q->guest_qshift; + kvm_eq.qaddr = q->guest_qaddr; + + rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle, + &kvm_eq.qindex); + if (rc) + return rc; + + pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n", + __func__, server, priority, kvm_eq.flags, + kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, 
kvm_eq.qindex); + + if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq))) + return -EFAULT; + + return 0; +} + +static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb) +{ + int i; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct kvmppc_xive_irq_state *state = &sb->irq_state[i]; + + if (!state->valid) + continue; + + if (state->act_priority == MASKED) + continue; + + state->eisn = 0; + state->act_server = 0; + state->act_priority = MASKED; + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(state->ipi_number, 0, MASKED, 0); + if (state->pt_number) { + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(state->pt_number, + 0, MASKED, 0); + } + } +} + +static int kvmppc_xive_reset(struct kvmppc_xive *xive) +{ + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + unsigned int i; + + pr_devel("%s\n", __func__); + + mutex_lock(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + unsigned int prio; + + if (!xc) + continue; + + kvmppc_xive_disable_vcpu_interrupts(vcpu); + + for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) { + + /* Single escalation, no queue 7 */ + if (prio == 7 && xive->single_escalation) + break; + + if (xc->esc_virq[prio]) { + free_irq(xc->esc_virq[prio], vcpu); + irq_dispose_mapping(xc->esc_virq[prio]); + kfree(xc->esc_virq_names[prio]); + xc->esc_virq[prio] = 0; + } + + kvmppc_xive_native_cleanup_queue(vcpu, prio); + } + } + + for (i = 0; i <= xive->max_sbid; i++) { + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; + + if (sb) { + arch_spin_lock(&sb->lock); + kvmppc_xive_reset_sources(sb); + arch_spin_unlock(&sb->lock); + } + } + + mutex_unlock(&kvm->lock); + + return 0; +} + +static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb) +{ + int j; + + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) { + struct kvmppc_xive_irq_state *state = &sb->irq_state[j]; + struct xive_irq_data 
*xd; + u32 hw_num; + + if (!state->valid) + continue; + + /* + * The struct kvmppc_xive_irq_state reflects the state + * of the EAS configuration and not the state of the + * source. The source is masked setting the PQ bits to + * '-Q', which is what is being done before calling + * the KVM_DEV_XIVE_EQ_SYNC control. + * + * If a source EAS is configured, OPAL syncs the XIVE + * IC of the source and the XIVE IC of the previous + * target if any. + * + * So it should be fine ignoring MASKED sources as + * they have been synced already. + */ + if (state->act_priority == MASKED) + continue; + + kvmppc_xive_select_irq(state, &hw_num, &xd); + xive_native_sync_source(hw_num); + xive_native_sync_queue(hw_num); + } +} + +static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + unsigned int prio; + + if (!xc) + return -ENOENT; + + for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) { + struct xive_q *q = &xc->queues[prio]; + + if (!q->qpage) + continue; + + /* Mark EQ page dirty for migration */ + mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr)); + } + return 0; +} + +static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive) +{ + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + unsigned int i; + + pr_devel("%s\n", __func__); + + mutex_lock(&kvm->lock); + for (i = 0; i <= xive->max_sbid; i++) { + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; + + if (sb) { + arch_spin_lock(&sb->lock); + kvmppc_xive_native_sync_sources(sb); + arch_spin_unlock(&sb->lock); + } + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvmppc_xive_native_vcpu_eq_sync(vcpu); + } + mutex_unlock(&kvm->lock); + + return 0; +} + +static int kvmppc_xive_native_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvmppc_xive *xive = dev->private; + + switch (attr->group) { + case KVM_DEV_XIVE_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_XIVE_RESET: + return kvmppc_xive_reset(xive); + case 
KVM_DEV_XIVE_EQ_SYNC: + return kvmppc_xive_native_eq_sync(xive); + } + break; + case KVM_DEV_XIVE_GRP_SOURCE: + return kvmppc_xive_native_set_source(xive, attr->attr, + attr->addr); + case KVM_DEV_XIVE_GRP_SOURCE_CONFIG: + return kvmppc_xive_native_set_source_config(xive, attr->attr, + attr->addr); + case KVM_DEV_XIVE_GRP_EQ_CONFIG: + return kvmppc_xive_native_set_queue_config(xive, attr->attr, + attr->addr); + case KVM_DEV_XIVE_GRP_SOURCE_SYNC: + return kvmppc_xive_native_sync_source(xive, attr->attr, + attr->addr); + } + return -ENXIO; +} + +static int kvmppc_xive_native_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvmppc_xive *xive = dev->private; + + switch (attr->group) { + case KVM_DEV_XIVE_GRP_EQ_CONFIG: + return kvmppc_xive_native_get_queue_config(xive, attr->attr, + attr->addr); + } + return -ENXIO; +} + +static int kvmppc_xive_native_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_XIVE_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_XIVE_RESET: + case KVM_DEV_XIVE_EQ_SYNC: + return 0; + } + break; + case KVM_DEV_XIVE_GRP_SOURCE: + case KVM_DEV_XIVE_GRP_SOURCE_CONFIG: + case KVM_DEV_XIVE_GRP_SOURCE_SYNC: + if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ && + attr->attr < KVMPPC_XIVE_NR_IRQS) + return 0; + break; + case KVM_DEV_XIVE_GRP_EQ_CONFIG: + return 0; + } + return -ENXIO; +} + +/* + * Called when device fd is closed + */ +static void kvmppc_xive_native_release(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = dev->private; + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + int i; + int was_ready; + + debugfs_remove(xive->dentry); + + pr_devel("Releasing xive native device\n"); + + /* + * Clearing mmu_ready temporarily while holding kvm->lock + * is a way of ensuring that no vcpus can enter the guest + * until we drop kvm->lock. Doing kick_all_cpus_sync() + * ensures that any vcpu executing inside the guest has + * exited the guest. 
Once kick_all_cpus_sync() has finished, + * we know that no vcpu can be executing the XIVE push or + * pull code or accessing the XIVE MMIO regions. + * + * Since this is the device release function, we know that + * userspace does not have any open fd or mmap referring to + * the device. Therefore there can not be any of the + * device attribute set/get, mmap, or page fault functions + * being executed concurrently, and similarly, the + * connect_vcpu and set/clr_mapped functions also cannot + * be being executed. + */ + was_ready = kvm->arch.mmu_ready; + kvm->arch.mmu_ready = 0; + kick_all_cpus_sync(); + + /* + * We should clean up the vCPU interrupt presenters first. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + /* + * Take vcpu->mutex to ensure that no one_reg get/set ioctl + * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done. + */ + mutex_lock(&vcpu->mutex); + kvmppc_xive_native_cleanup_vcpu(vcpu); + mutex_unlock(&vcpu->mutex); + } + + kvm->arch.xive = NULL; + + for (i = 0; i <= xive->max_sbid; i++) { + if (xive->src_blocks[i]) + kvmppc_xive_free_sources(xive->src_blocks[i]); + kfree(xive->src_blocks[i]); + xive->src_blocks[i] = NULL; + } + + if (xive->vp_base != XIVE_INVALID_VP) + xive_native_free_vp_block(xive->vp_base); + + kvm->arch.mmu_ready = was_ready; + + /* + * A reference of the kvmppc_xive pointer is now kept under + * the xive_devices struct of the machine for reuse. It is + * freed when the VM is destroyed for now until we fix all the + * execution paths. + */ + + kfree(dev); +} + +/* + * Create a XIVE device. kvm->lock is held. 
+ */ +static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) +{ + struct kvmppc_xive *xive; + struct kvm *kvm = dev->kvm; + int ret = 0; + + pr_devel("Creating xive native device\n"); + + if (kvm->arch.xive) + return -EEXIST; + + xive = kvmppc_xive_get_device(kvm, type); + if (!xive) + return -ENOMEM; + + dev->private = xive; + xive->dev = dev; + xive->kvm = kvm; + kvm->arch.xive = xive; + mutex_init(&xive->mapping_lock); + + /* + * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for + * a default. Getting the max number of CPUs the VM was + * configured with would improve our usage of the XIVE VP space. + */ + xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); + pr_devel("VP_Base=%x\n", xive->vp_base); + + if (xive->vp_base == XIVE_INVALID_VP) + ret = -ENXIO; + + xive->single_escalation = xive_native_has_single_escalation(); + xive->ops = &kvmppc_xive_native_ops; + + if (ret) + kfree(xive); + + return ret; +} + +/* + * Interrupt Pending Buffer (IPB) offset + */ +#define TM_IPB_SHIFT 40 +#define TM_IPB_MASK (((u64) 0xFF) << TM_IPB_SHIFT) + +int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u64 opal_state; + int rc; + + if (!kvmppc_xive_enabled(vcpu)) + return -EPERM; + + if (!xc) + return -ENOENT; + + /* Thread context registers. We only care about IPB and CPPR */ + val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01; + + /* Get the VP state from OPAL */ + rc = xive_native_get_vp_state(xc->vp_id, &opal_state); + if (rc) + return rc; + + /* + * Capture the backup of IPB register in the NVT structure and + * merge it in our KVM VP state. 
+ */ + val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK); + + pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n", + __func__, + vcpu->arch.xive_saved_state.nsr, + vcpu->arch.xive_saved_state.cppr, + vcpu->arch.xive_saved_state.ipb, + vcpu->arch.xive_saved_state.pipr, + vcpu->arch.xive_saved_state.w01, + (u32) vcpu->arch.xive_cam_word, opal_state); + + return 0; +} + +int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + + pr_devel("%s w01=%016llx vp=%016llx\n", __func__, + val->xive_timaval[0], val->xive_timaval[1]); + + if (!kvmppc_xive_enabled(vcpu)) + return -EPERM; + + if (!xc || !xive) + return -ENOENT; + + /* We can't update the state of a "pushed" VCPU */ + if (WARN_ON(vcpu->arch.xive_pushed)) + return -EBUSY; + + /* + * Restore the thread context registers. IPB and CPPR should + * be the only ones that matter. + */ + vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0]; + + /* + * There is no need to restore the XIVE internal state (IPB + * stored in the NVT) as the IPB register was merged in KVM VP + * state when captured. 
+ */ + return 0; +} + +static int xive_native_debug_show(struct seq_file *m, void *private) +{ + struct kvmppc_xive *xive = m->private; + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + unsigned int i; + + if (!kvm) + return 0; + + seq_puts(m, "=========\nVCPU state\n=========\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + if (!xc) + continue; + + seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", + xc->server_num, + vcpu->arch.xive_saved_state.nsr, + vcpu->arch.xive_saved_state.cppr, + vcpu->arch.xive_saved_state.ipb, + vcpu->arch.xive_saved_state.pipr, + vcpu->arch.xive_saved_state.w01, + (u32) vcpu->arch.xive_cam_word); + + kvmppc_xive_debug_show_queues(m, vcpu); + } + + return 0; +} + +static int xive_native_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, xive_native_debug_show, inode->i_private); +} + +static const struct file_operations xive_native_debug_fops = { + .open = xive_native_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void xive_native_debugfs_init(struct kvmppc_xive *xive) +{ + char *name; + + name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive); + if (!name) { + pr_err("%s: no memory for name\n", __func__); + return; + } + + xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xive, &xive_native_debug_fops); + + pr_debug("%s: created %s\n", __func__, name); + kfree(name); +} + +static void kvmppc_xive_native_init(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private; + + /* Register some debug interfaces */ + xive_native_debugfs_init(xive); +} + +struct kvm_device_ops kvm_xive_native_ops = { + .name = "kvm-xive-native", + .create = kvmppc_xive_native_create, + .init = kvmppc_xive_native_init, + .release = kvmppc_xive_native_release, + .set_attr = kvmppc_xive_native_set_attr, + .get_attr = 
kvmppc_xive_native_get_attr, + .has_attr = kvmppc_xive_native_has_attr, + .mmap = kvmppc_xive_native_mmap, +}; + +void kvmppc_xive_native_init_module(void) +{ + ; +} + +void kvmppc_xive_native_exit_module(void) +{ + ; +} diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 033363d6e764..0737acfd17f1 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -130,24 +130,14 @@ static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc, */ prio = ffs(pending) - 1; - /* - * If the most favoured prio we found pending is less - * favored (or equal) than a pending IPI, we return - * the IPI instead. - * - * Note: If pending was 0 and mfrr is 0xff, we will - * not spurriously take an IPI because mfrr cannot - * then be smaller than cppr. - */ - if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { - prio = xc->mfrr; - hirq = XICS_IPI; - break; - } - /* Don't scan past the guest cppr */ - if (prio >= xc->cppr || prio > 7) + if (prio >= xc->cppr || prio > 7) { + if (xc->mfrr < xc->cppr) { + prio = xc->mfrr; + hirq = XICS_IPI; + } break; + } /* Grab queue and pointers */ q = &xc->queues[prio]; @@ -184,9 +174,12 @@ skip_ipi: * been set and another occurrence of the IPI will trigger. 
*/ if (hirq == XICS_IPI || (prio == 0 && !qpage)) { - if (scan_type == scan_fetch) + if (scan_type == scan_fetch) { GLUE(X_PFX,source_eoi)(xc->vp_ipi, &xc->vp_ipi_data); + q->idx = idx; + q->toggle = toggle; + } /* Loop back on same queue with updated idx/toggle */ #ifdef XIVE_RUNTIME_CHECKS WARN_ON(hirq && hirq != XICS_IPI); @@ -199,32 +192,41 @@ skip_ipi: if (hirq == XICS_DUMMY) goto skip_ipi; - /* If fetching, update queue pointers */ - if (scan_type == scan_fetch) { - q->idx = idx; - q->toggle = toggle; - } - - /* Something found, stop searching */ - if (hirq) - break; - - /* Clear the pending bit on the now empty queue */ - pending &= ~(1 << prio); + /* Clear the pending bit if the queue is now empty */ + if (!hirq) { + pending &= ~(1 << prio); - /* - * Check if the queue count needs adjusting due to - * interrupts being moved away. - */ - if (atomic_read(&q->pending_count)) { - int p = atomic_xchg(&q->pending_count, 0); - if (p) { + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. + */ + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + if (p) { #ifdef XIVE_RUNTIME_CHECKS - WARN_ON(p > atomic_read(&q->count)); + WARN_ON(p > atomic_read(&q->count)); #endif - atomic_sub(p, &q->count); + atomic_sub(p, &q->count); + } } } + + /* + * If the most favoured prio we found pending is less + * favored (or equal) than a pending IPI, we return + * the IPI instead. 
+ */ + if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { + prio = xc->mfrr; + hirq = XICS_IPI; + break; + } + + /* If fetching, update queue pointers */ + if (scan_type == scan_fetch) { + q->idx = idx; + q->toggle = toggle; + } } /* If we are just taking a "peek", do nothing else */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8885377ec3e0..3393b166817a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -570,6 +570,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_GET_CPU_CHAR: r = 1; break; +#ifdef CONFIG_KVM_XIVE + case KVM_CAP_PPC_IRQ_XIVE: + /* + * We need XIVE to be enabled on the platform (implies + * a POWER9 processor) and the PowerNV platform, as + * nested is not yet supported. + */ + r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE); + break; +#endif case KVM_CAP_PPC_ALLOC_HTAB: r = hv_enabled; @@ -644,9 +654,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = num_online_cpus(); break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; @@ -753,6 +760,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) else kvmppc_xics_free_icp(vcpu); break; + case KVMPPC_IRQ_XIVE: + kvmppc_xive_native_cleanup_vcpu(vcpu); + break; } kvmppc_core_vcpu_free(vcpu); @@ -1941,6 +1951,30 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } #endif /* CONFIG_KVM_XICS */ +#ifdef CONFIG_KVM_XIVE + case KVM_CAP_PPC_IRQ_XIVE: { + struct fd f; + struct kvm_device *dev; + + r = -EBADF; + f = fdget(cap->args[0]); + if (!f.file) + break; + + r = -ENXIO; + if (!xive_enabled()) + break; + + r = -EPERM; + dev = kvm_device_from_filp(f.file); + if (dev) + r = kvmppc_xive_native_connect_vcpu(dev, vcpu, + cap->args[1]); + + fdput(f); + break; + } +#endif /* CONFIG_KVM_XIVE */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_FWNMI: r = -EINVAL; diff --git a/arch/powerpc/mm/book3s32/hash_low.S 
b/arch/powerpc/mm/book3s32/hash_low.S index e27792d0b744..8366c2abeafc 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -539,7 +539,8 @@ _GLOBAL(flush_hash_pages) #ifdef CONFIG_SMP lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l - lwz r8,TASK_CPU(r2) + tophys (r8, r2) + lwz r8, TASK_CPU(r8) oris r8,r8,9 10: lwarx r0,0,r9 cmpi 0,r0,0 diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index c5c9ff2d7afc..b5d92dc32844 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -556,7 +556,7 @@ static int __init add_huge_page_size(unsigned long long size) if (size <= PAGE_SIZE || !is_power_of_2(size)) return -EINVAL; - mmu_psize = check_and_get_huge_psize(size); + mmu_psize = check_and_get_huge_psize(shift); if (mmu_psize < 0) return -EINVAL; diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 0c037e933e55..7782201e5fe8 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -521,6 +521,9 @@ u32 xive_native_default_eq_shift(void) } EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); +unsigned long xive_tima_os; +EXPORT_SYMBOL_GPL(xive_tima_os); + bool __init xive_native_init(void) { struct device_node *np; @@ -573,6 +576,14 @@ bool __init xive_native_init(void) for_each_possible_cpu(cpu) kvmppc_set_xive_tima(cpu, r.start, tima); + /* Resource 2 is OS window */ + if (of_address_to_resource(np, 2, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + return false; + } + + xive_tima_os = r.start; + /* Grab size of provisionning pages */ xive_parse_provisioning(np); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e66745decea1..ee32c66e1af3 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -27,7 +27,7 @@ config RISCV select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_SMP_IDLE_THREAD - select GENERIC_ATOMIC64 if !64BIT || 
!RISCV_ISA_A + select GENERIC_ATOMIC64 if !64BIT select HAVE_ARCH_AUDITSYSCALL select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_CONTIGUOUS @@ -35,7 +35,6 @@ config RISCV select HAVE_PERF_EVENTS select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN - select RISCV_ISA_A if SMP select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select HAVE_ARCH_TRACEHOOK @@ -195,9 +194,6 @@ config RISCV_ISA_C If you don't know what to do here, say Y. -config RISCV_ISA_A - def_bool y - menu "supported PMU type" depends on PERF_EVENTS diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index c6342e638ef7..6b0741c9f348 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -39,9 +39,8 @@ endif KBUILD_CFLAGS += -Wall # ISA string setting -riscv-march-$(CONFIG_ARCH_RV32I) := rv32im -riscv-march-$(CONFIG_ARCH_RV64I) := rv64im -riscv-march-$(CONFIG_RISCV_ISA_A) := $(riscv-march-y)a +riscv-march-$(CONFIG_ARCH_RV32I) := rv32ima +riscv-march-$(CONFIG_ARCH_RV64I) := rv64ima riscv-march-$(CONFIG_FPU) := $(riscv-march-y)fd riscv-march-$(CONFIG_RISCV_ISA_C) := $(riscv-march-y)c KBUILD_CFLAGS += -march=$(subst fd,,$(riscv-march-y)) diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index cccd12cf27d4..5a7a19d9aa7f 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += compat.h generic-y += cputime.h generic-y += device.h generic-y += div64.h +generic-y += extable.h generic-y += dma.h generic-y += dma-contiguous.h generic-y += dma-mapping.h diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index bfc7f099ab1f..52a1fbdeab3b 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -21,7 +21,12 @@ #include <asm/asm.h> #ifdef CONFIG_GENERIC_BUG -#define __BUG_INSN _AC(0x00100073, UL) /* ebreak */ +#define __INSN_LENGTH_MASK _UL(0x3) +#define __INSN_LENGTH_32 _UL(0x3) +#define __COMPRESSED_INSN_MASK _UL(0xffff) + +#define __BUG_INSN_32 _UL(0x00100073) /* ebreak */ +#define 
__BUG_INSN_16 _UL(0x9002) /* c.ebreak */ #ifndef __ASSEMBLY__ typedef u32 bug_insn_t; @@ -38,38 +43,46 @@ typedef u32 bug_insn_t; #define __BUG_ENTRY \ __BUG_ENTRY_ADDR "\n\t" \ __BUG_ENTRY_FILE "\n\t" \ - RISCV_SHORT " %1" + RISCV_SHORT " %1\n\t" \ + RISCV_SHORT " %2" #else #define __BUG_ENTRY \ - __BUG_ENTRY_ADDR + __BUG_ENTRY_ADDR "\n\t" \ + RISCV_SHORT " %2" #endif -#define BUG() \ +#define __BUG_FLAGS(flags) \ do { \ __asm__ __volatile__ ( \ "1:\n\t" \ "ebreak\n" \ - ".pushsection __bug_table,\"a\"\n\t" \ + ".pushsection __bug_table,\"aw\"\n\t" \ "2:\n\t" \ __BUG_ENTRY "\n\t" \ - ".org 2b + %2\n\t" \ + ".org 2b + %3\n\t" \ ".popsection" \ : \ : "i" (__FILE__), "i" (__LINE__), \ - "i" (sizeof(struct bug_entry))); \ - unreachable(); \ + "i" (flags), \ + "i" (sizeof(struct bug_entry))); \ } while (0) + #endif /* !__ASSEMBLY__ */ #else /* CONFIG_GENERIC_BUG */ #ifndef __ASSEMBLY__ -#define BUG() \ -do { \ +#define __BUG_FLAGS(flags) do { \ __asm__ __volatile__ ("ebreak\n"); \ - unreachable(); \ } while (0) #endif /* !__ASSEMBLY__ */ #endif /* CONFIG_GENERIC_BUG */ +#define BUG() do { \ + __BUG_FLAGS(0); \ + unreachable(); \ +} while (0) + +#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) + #define HAVE_ARCH_BUG #include <asm-generic/bug.h> diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8f13074413a7..1f4ba68ab9aa 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -47,7 +47,7 @@ static inline void flush_dcache_page(struct page *page) #else /* CONFIG_SMP */ -#define flush_icache_all() sbi_remote_fence_i(NULL) +void flush_icache_all(void); void flush_icache_mm(struct mm_struct *mm, bool local); #endif /* CONFIG_SMP */ diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 28a0d1cb374c..3c3c26c3a1f1 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -14,64 +14,95 @@ #ifndef _ASM_RISCV_CSR_H #define 
_ASM_RISCV_CSR_H +#include <asm/asm.h> #include <linux/const.h> /* Status register flags */ -#define SR_SIE _AC(0x00000002, UL) /* Supervisor Interrupt Enable */ -#define SR_SPIE _AC(0x00000020, UL) /* Previous Supervisor IE */ -#define SR_SPP _AC(0x00000100, UL) /* Previously Supervisor */ -#define SR_SUM _AC(0x00040000, UL) /* Supervisor may access User Memory */ - -#define SR_FS _AC(0x00006000, UL) /* Floating-point Status */ -#define SR_FS_OFF _AC(0x00000000, UL) -#define SR_FS_INITIAL _AC(0x00002000, UL) -#define SR_FS_CLEAN _AC(0x00004000, UL) -#define SR_FS_DIRTY _AC(0x00006000, UL) - -#define SR_XS _AC(0x00018000, UL) /* Extension Status */ -#define SR_XS_OFF _AC(0x00000000, UL) -#define SR_XS_INITIAL _AC(0x00008000, UL) -#define SR_XS_CLEAN _AC(0x00010000, UL) -#define SR_XS_DIRTY _AC(0x00018000, UL) +#define SR_SIE _AC(0x00000002, UL) /* Supervisor Interrupt Enable */ +#define SR_SPIE _AC(0x00000020, UL) /* Previous Supervisor IE */ +#define SR_SPP _AC(0x00000100, UL) /* Previously Supervisor */ +#define SR_SUM _AC(0x00040000, UL) /* Supervisor User Memory Access */ + +#define SR_FS _AC(0x00006000, UL) /* Floating-point Status */ +#define SR_FS_OFF _AC(0x00000000, UL) +#define SR_FS_INITIAL _AC(0x00002000, UL) +#define SR_FS_CLEAN _AC(0x00004000, UL) +#define SR_FS_DIRTY _AC(0x00006000, UL) + +#define SR_XS _AC(0x00018000, UL) /* Extension Status */ +#define SR_XS_OFF _AC(0x00000000, UL) +#define SR_XS_INITIAL _AC(0x00008000, UL) +#define SR_XS_CLEAN _AC(0x00010000, UL) +#define SR_XS_DIRTY _AC(0x00018000, UL) #ifndef CONFIG_64BIT -#define SR_SD _AC(0x80000000, UL) /* FS/XS dirty */ +#define SR_SD _AC(0x80000000, UL) /* FS/XS dirty */ #else -#define SR_SD _AC(0x8000000000000000, UL) /* FS/XS dirty */ +#define SR_SD _AC(0x8000000000000000, UL) /* FS/XS dirty */ #endif /* SATP flags */ -#if __riscv_xlen == 32 -#define SATP_PPN _AC(0x003FFFFF, UL) -#define SATP_MODE_32 _AC(0x80000000, UL) -#define SATP_MODE SATP_MODE_32 +#ifndef CONFIG_64BIT +#define 
SATP_PPN _AC(0x003FFFFF, UL) +#define SATP_MODE_32 _AC(0x80000000, UL) +#define SATP_MODE SATP_MODE_32 #else -#define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) -#define SATP_MODE_39 _AC(0x8000000000000000, UL) -#define SATP_MODE SATP_MODE_39 +#define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) +#define SATP_MODE_39 _AC(0x8000000000000000, UL) +#define SATP_MODE SATP_MODE_39 #endif -/* Interrupt Enable and Interrupt Pending flags */ -#define SIE_SSIE _AC(0x00000002, UL) /* Software Interrupt Enable */ -#define SIE_STIE _AC(0x00000020, UL) /* Timer Interrupt Enable */ -#define SIE_SEIE _AC(0x00000200, UL) /* External Interrupt Enable */ - -#define EXC_INST_MISALIGNED 0 -#define EXC_INST_ACCESS 1 -#define EXC_BREAKPOINT 3 -#define EXC_LOAD_ACCESS 5 -#define EXC_STORE_ACCESS 7 -#define EXC_SYSCALL 8 -#define EXC_INST_PAGE_FAULT 12 -#define EXC_LOAD_PAGE_FAULT 13 -#define EXC_STORE_PAGE_FAULT 15 +/* SCAUSE */ +#define SCAUSE_IRQ_FLAG (_AC(1, UL) << (__riscv_xlen - 1)) + +#define IRQ_U_SOFT 0 +#define IRQ_S_SOFT 1 +#define IRQ_M_SOFT 3 +#define IRQ_U_TIMER 4 +#define IRQ_S_TIMER 5 +#define IRQ_M_TIMER 7 +#define IRQ_U_EXT 8 +#define IRQ_S_EXT 9 +#define IRQ_M_EXT 11 + +#define EXC_INST_MISALIGNED 0 +#define EXC_INST_ACCESS 1 +#define EXC_BREAKPOINT 3 +#define EXC_LOAD_ACCESS 5 +#define EXC_STORE_ACCESS 7 +#define EXC_SYSCALL 8 +#define EXC_INST_PAGE_FAULT 12 +#define EXC_LOAD_PAGE_FAULT 13 +#define EXC_STORE_PAGE_FAULT 15 + +/* SIE (Interrupt Enable) and SIP (Interrupt Pending) flags */ +#define SIE_SSIE (_AC(0x1, UL) << IRQ_S_SOFT) +#define SIE_STIE (_AC(0x1, UL) << IRQ_S_TIMER) +#define SIE_SEIE (_AC(0x1, UL) << IRQ_S_EXT) + +#define CSR_CYCLE 0xc00 +#define CSR_TIME 0xc01 +#define CSR_INSTRET 0xc02 +#define CSR_SSTATUS 0x100 +#define CSR_SIE 0x104 +#define CSR_STVEC 0x105 +#define CSR_SCOUNTEREN 0x106 +#define CSR_SSCRATCH 0x140 +#define CSR_SEPC 0x141 +#define CSR_SCAUSE 0x142 +#define CSR_STVAL 0x143 +#define CSR_SIP 0x144 +#define CSR_SATP 0x180 +#define CSR_CYCLEH 0xc80 
+#define CSR_TIMEH 0xc81 +#define CSR_INSTRETH 0xc82 #ifndef __ASSEMBLY__ #define csr_swap(csr, val) \ ({ \ unsigned long __v = (unsigned long)(val); \ - __asm__ __volatile__ ("csrrw %0, " #csr ", %1" \ + __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1"\ : "=r" (__v) : "rK" (__v) \ : "memory"); \ __v; \ @@ -80,7 +111,7 @@ #define csr_read(csr) \ ({ \ register unsigned long __v; \ - __asm__ __volatile__ ("csrr %0, " #csr \ + __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) \ : "=r" (__v) : \ : "memory"); \ __v; \ @@ -89,7 +120,7 @@ #define csr_write(csr, val) \ ({ \ unsigned long __v = (unsigned long)(val); \ - __asm__ __volatile__ ("csrw " #csr ", %0" \ + __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" \ : : "rK" (__v) \ : "memory"); \ }) @@ -97,7 +128,7 @@ #define csr_read_set(csr, val) \ ({ \ unsigned long __v = (unsigned long)(val); \ - __asm__ __volatile__ ("csrrs %0, " #csr ", %1" \ + __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1"\ : "=r" (__v) : "rK" (__v) \ : "memory"); \ __v; \ @@ -106,7 +137,7 @@ #define csr_set(csr, val) \ ({ \ unsigned long __v = (unsigned long)(val); \ - __asm__ __volatile__ ("csrs " #csr ", %0" \ + __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" \ : : "rK" (__v) \ : "memory"); \ }) @@ -114,7 +145,7 @@ #define csr_read_clear(csr, val) \ ({ \ unsigned long __v = (unsigned long)(val); \ - __asm__ __volatile__ ("csrrc %0, " #csr ", %1" \ + __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1"\ : "=r" (__v) : "rK" (__v) \ : "memory"); \ __v; \ @@ -123,7 +154,7 @@ #define csr_clear(csr, val) \ ({ \ unsigned long __v = (unsigned long)(val); \ - __asm__ __volatile__ ("csrc " #csr ", %0" \ + __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" \ : : "rK" (__v) \ : "memory"); \ }) diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index 697fc23b0d5a..ce0cd7d77eb0 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -27,13 +27,7 @@ #define ELF_CLASS ELFCLASS32 #endif 
-#if defined(__LITTLE_ENDIAN) #define ELF_DATA ELFDATA2LSB -#elif defined(__BIG_ENDIAN) -#define ELF_DATA ELFDATA2MSB -#else -#error "Unknown endianness" -#endif /* * This is used to ensure we don't load something for the wrong architecture. diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h index 66641624d8a5..4ad6409c4647 100644 --- a/arch/riscv/include/asm/futex.h +++ b/arch/riscv/include/asm/futex.h @@ -7,18 +7,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifndef CONFIG_RISCV_ISA_A -/* - * Use the generic interrupt disabling versions if the A extension - * is not supported. - */ -#ifdef CONFIG_SMP -#error "Can't support generic futex calls without A extension on SMP" -#endif -#include <asm-generic/futex.h> - -#else /* CONFIG_RISCV_ISA_A */ - #include <linux/futex.h> #include <linux/uaccess.h> #include <linux/errno.h> @@ -124,5 +112,4 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return ret; } -#endif /* CONFIG_RISCV_ISA_A */ #endif /* _ASM_FUTEX_H */ diff --git a/arch/riscv/include/asm/irqflags.h b/arch/riscv/include/asm/irqflags.h index 07a3c6d5706f..1a69b3bcd371 100644 --- a/arch/riscv/include/asm/irqflags.h +++ b/arch/riscv/include/asm/irqflags.h @@ -21,25 +21,25 @@ /* read interrupt enabled status */ static inline unsigned long arch_local_save_flags(void) { - return csr_read(sstatus); + return csr_read(CSR_SSTATUS); } /* unconditionally enable interrupts */ static inline void arch_local_irq_enable(void) { - csr_set(sstatus, SR_SIE); + csr_set(CSR_SSTATUS, SR_SIE); } /* unconditionally disable interrupts */ static inline void arch_local_irq_disable(void) { - csr_clear(sstatus, SR_SIE); + csr_clear(CSR_SSTATUS, SR_SIE); } /* get status and disable interrupts */ static inline unsigned long arch_local_irq_save(void) { - return csr_read_clear(sstatus, SR_SIE); + return csr_read_clear(CSR_SSTATUS, SR_SIE); } /* test flags */ @@ -57,7 +57,7 @@ static inline int arch_irqs_disabled(void) /* set interrupt enabled status 
*/ static inline void arch_local_irq_restore(unsigned long flags) { - csr_set(sstatus, flags & SR_SIE); + csr_set(CSR_SSTATUS, flags & SR_SIE); } #endif /* _ASM_RISCV_IRQFLAGS_H */ diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h index 336d60ec5698..bf4f097a9051 100644 --- a/arch/riscv/include/asm/mmu_context.h +++ b/arch/riscv/include/asm/mmu_context.h @@ -20,8 +20,6 @@ #include <linux/mm.h> #include <linux/sched.h> -#include <asm/tlbflush.h> -#include <asm/cacheflush.h> static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *task) @@ -39,61 +37,8 @@ static inline void destroy_context(struct mm_struct *mm) { } -/* - * When necessary, performs a deferred icache flush for the given MM context, - * on the local CPU. RISC-V has no direct mechanism for instruction cache - * shoot downs, so instead we send an IPI that informs the remote harts they - * need to flush their local instruction caches. To avoid pathologically slow - * behavior in a common case (a bunch of single-hart processes on a many-hart - * machine, ie 'make -j') we avoid the IPIs for harts that are not currently - * executing a MM context and instead schedule a deferred local instruction - * cache flush to be performed before execution resumes on each hart. This - * actually performs that local instruction cache flush, which implicitly only - * refers to the current hart. - */ -static inline void flush_icache_deferred(struct mm_struct *mm) -{ -#ifdef CONFIG_SMP - unsigned int cpu = smp_processor_id(); - cpumask_t *mask = &mm->context.icache_stale_mask; - - if (cpumask_test_cpu(cpu, mask)) { - cpumask_clear_cpu(cpu, mask); - /* - * Ensure the remote hart's writes are visible to this hart. - * This pairs with a barrier in flush_icache_mm. 
- */ - smp_mb(); - local_flush_icache_all(); - } -#endif -} - -static inline void switch_mm(struct mm_struct *prev, - struct mm_struct *next, struct task_struct *task) -{ - if (likely(prev != next)) { - /* - * Mark the current MM context as inactive, and the next as - * active. This is at least used by the icache flushing - * routines in order to determine who should - */ - unsigned int cpu = smp_processor_id(); - - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Use the old spbtr name instead of using the current satp - * name to support binutils 2.29 which doesn't know about the - * privileged ISA 1.10 yet. - */ - csr_write(sptbr, virt_to_pfn(next->pgd) | SATP_MODE); - local_flush_tlb_all(); - - flush_icache_deferred(next); - } -} +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *task); static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index d35ec2f41381..9c867a4bac83 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -70,47 +70,38 @@ struct pt_regs { /* Helpers for working with the instruction pointer */ -#define GET_IP(regs) ((regs)->sepc) -#define SET_IP(regs, val) (GET_IP(regs) = (val)) - static inline unsigned long instruction_pointer(struct pt_regs *regs) { - return GET_IP(regs); + return regs->sepc; } static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { - SET_IP(regs, val); + regs->sepc = val; } #define profile_pc(regs) instruction_pointer(regs) /* Helpers for working with the user stack pointer */ -#define GET_USP(regs) ((regs)->sp) -#define SET_USP(regs, val) (GET_USP(regs) = (val)) - static inline unsigned long user_stack_pointer(struct pt_regs *regs) { - return GET_USP(regs); + return regs->sp; } static inline void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) { - SET_USP(regs, 
val); + regs->sp = val; } /* Helpers for working with the frame pointer */ -#define GET_FP(regs) ((regs)->s0) -#define SET_FP(regs, val) (GET_FP(regs) = (val)) - static inline unsigned long frame_pointer(struct pt_regs *regs) { - return GET_FP(regs); + return regs->s0; } static inline void frame_pointer_set(struct pt_regs *regs, unsigned long val) { - SET_FP(regs, val); + regs->s0 = val; } static inline unsigned long regs_return_value(struct pt_regs *regs) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index b6bb10b92fe2..19f231615510 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -26,22 +26,27 @@ #define SBI_REMOTE_SFENCE_VMA_ASID 7 #define SBI_SHUTDOWN 8 -#define SBI_CALL(which, arg0, arg1, arg2) ({ \ +#define SBI_CALL(which, arg0, arg1, arg2, arg3) ({ \ register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); \ register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); \ register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); \ + register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); \ register uintptr_t a7 asm ("a7") = (uintptr_t)(which); \ asm volatile ("ecall" \ : "+r" (a0) \ - : "r" (a1), "r" (a2), "r" (a7) \ + : "r" (a1), "r" (a2), "r" (a3), "r" (a7) \ : "memory"); \ a0; \ }) /* Lazy implementations until SBI is finalized */ -#define SBI_CALL_0(which) SBI_CALL(which, 0, 0, 0) -#define SBI_CALL_1(which, arg0) SBI_CALL(which, arg0, 0, 0) -#define SBI_CALL_2(which, arg0, arg1) SBI_CALL(which, arg0, arg1, 0) +#define SBI_CALL_0(which) SBI_CALL(which, 0, 0, 0, 0) +#define SBI_CALL_1(which, arg0) SBI_CALL(which, arg0, 0, 0, 0) +#define SBI_CALL_2(which, arg0, arg1) SBI_CALL(which, arg0, arg1, 0, 0) +#define SBI_CALL_3(which, arg0, arg1, arg2) \ + SBI_CALL(which, arg0, arg1, arg2, 0) +#define SBI_CALL_4(which, arg0, arg1, arg2, arg3) \ + SBI_CALL(which, arg0, arg1, arg2, arg3) static inline void sbi_console_putchar(int ch) { @@ -86,7 +91,7 @@ static inline void sbi_remote_sfence_vma(const unsigned long *hart_mask, 
unsigned long start, unsigned long size) { - SBI_CALL_1(SBI_REMOTE_SFENCE_VMA, hart_mask); + SBI_CALL_3(SBI_REMOTE_SFENCE_VMA, hart_mask, start, size); } static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, @@ -94,7 +99,7 @@ static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, unsigned long size, unsigned long asid) { - SBI_CALL_1(SBI_REMOTE_SFENCE_VMA_ASID, hart_mask); + SBI_CALL_4(SBI_REMOTE_SFENCE_VMA_ASID, hart_mask, start, size, asid); } #endif diff --git a/arch/riscv/include/asm/sifive_l2_cache.h b/arch/riscv/include/asm/sifive_l2_cache.h new file mode 100644 index 000000000000..04f6748fc50b --- /dev/null +++ b/arch/riscv/include/asm/sifive_l2_cache.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SiFive L2 Cache Controller header file + * + */ + +#ifndef _ASM_RISCV_SIFIVE_L2_CACHE_H +#define _ASM_RISCV_SIFIVE_L2_CACHE_H + +extern int register_sifive_l2_error_notifier(struct notifier_block *nb); +extern int unregister_sifive_l2_error_notifier(struct notifier_block *nb); + +#define SIFIVE_L2_ERR_TYPE_CE 0 +#define SIFIVE_L2_ERR_TYPE_UE 1 + +#endif /* _ASM_RISCV_SIFIVE_L2_CACHE_H */ diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 1c9cc8389928..9c039870019b 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -28,7 +28,9 @@ #include <asm/processor.h> #include <asm/csr.h> -typedef unsigned long mm_segment_t; +typedef struct { + unsigned long seg; +} mm_segment_t; /* * low level task data that entry.S needs immediate access to diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index fb53a8089e76..b26f407be5c8 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -23,6 +23,7 @@ #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/byteorder.h> +#include <asm/extable.h> #include <asm/asm.h> #define __enable_user_access() \ @@ 
-38,8 +39,10 @@ * For historical reasons, these macros are grossly misnamed. */ -#define KERNEL_DS (~0UL) -#define USER_DS (TASK_SIZE) +#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) + +#define KERNEL_DS MAKE_MM_SEG(~0UL) +#define USER_DS MAKE_MM_SEG(TASK_SIZE) #define get_fs() (current_thread_info()->addr_limit) @@ -48,9 +51,9 @@ static inline void set_fs(mm_segment_t fs) current_thread_info()->addr_limit = fs; } -#define segment_eq(a, b) ((a) == (b)) +#define segment_eq(a, b) ((a).seg == (b).seg) -#define user_addr_max() (get_fs()) +#define user_addr_max() (get_fs().seg) /** @@ -82,7 +85,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size) { const mm_segment_t fs = get_fs(); - return (size <= fs) && (addr <= (fs - size)); + return size <= fs.seg && addr <= fs.seg - size; } /* @@ -98,21 +101,8 @@ static inline int __access_ok(unsigned long addr, unsigned long size) * on our cache or tlb entries. */ -struct exception_table_entry { - unsigned long insn, fixup; -}; - -extern int fixup_exception(struct pt_regs *state); - -#if defined(__LITTLE_ENDIAN) -#define __MSW 1 #define __LSW 0 -#elif defined(__BIG_ENDIAN) -#define __MSW 0 -#define __LSW 1 -#else -#error "Unknown endianness" -#endif +#define __MSW 1 /* * The "__xxx" versions of the user access functions do not verify the address diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index dac98348c6a3..578bb5efc085 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -312,9 +312,6 @@ void asm_offsets(void) - offsetof(struct task_struct, thread.fstate.f[0]) ); - /* The assembler needs access to THREAD_SIZE as well. */ - DEFINE(ASM_THREAD_SIZE, THREAD_SIZE); - /* * We allocate a pt_regs on the stack when entering the kernel. This * ensures the alignment is sane. 
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index cf2fca12414a..c8d2a3223099 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -136,8 +136,7 @@ static void c_stop(struct seq_file *m, void *v) static int c_show(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; - struct device_node *node = of_get_cpu_node(cpuid_to_hartid_map(cpu_id), - NULL); + struct device_node *node = of_get_cpu_node(cpu_id, NULL); const char *compat, *isa, *mmu; seq_printf(m, "processor\t: %lu\n", cpu_id); diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index fd9b57c8b4ce..1c1ecc238cfa 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -37,11 +37,11 @@ * the kernel thread pointer. If we came from the kernel, sscratch * will contain 0, and we should continue on the current TP. */ - csrrw tp, sscratch, tp + csrrw tp, CSR_SSCRATCH, tp bnez tp, _save_context _restore_kernel_tpsp: - csrr tp, sscratch + csrr tp, CSR_SSCRATCH REG_S sp, TASK_TI_KERNEL_SP(tp) _save_context: REG_S sp, TASK_TI_USER_SP(tp) @@ -87,11 +87,11 @@ _save_context: li t0, SR_SUM | SR_FS REG_L s0, TASK_TI_USER_SP(tp) - csrrc s1, sstatus, t0 - csrr s2, sepc - csrr s3, sbadaddr - csrr s4, scause - csrr s5, sscratch + csrrc s1, CSR_SSTATUS, t0 + csrr s2, CSR_SEPC + csrr s3, CSR_STVAL + csrr s4, CSR_SCAUSE + csrr s5, CSR_SSCRATCH REG_S s0, PT_SP(sp) REG_S s1, PT_SSTATUS(sp) REG_S s2, PT_SEPC(sp) @@ -107,8 +107,8 @@ _save_context: .macro RESTORE_ALL REG_L a0, PT_SSTATUS(sp) REG_L a2, PT_SEPC(sp) - csrw sstatus, a0 - csrw sepc, a2 + csrw CSR_SSTATUS, a0 + csrw CSR_SEPC, a2 REG_L x1, PT_RA(sp) REG_L x3, PT_GP(sp) @@ -155,7 +155,7 @@ ENTRY(handle_exception) * Set sscratch register to 0, so that if a recursive exception * occurs, the exception vector knows it came from the kernel */ - csrw sscratch, x0 + csrw CSR_SSCRATCH, x0 /* Load the global pointer */ .option push @@ -248,7 +248,7 @@ resume_userspace: * Save TP into sscratch, so we 
can find the kernel data structures * again. */ - csrw sscratch, tp + csrw CSR_SSCRATCH, tp restore_all: RESTORE_ALL diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index fe884cd69abd..370c66ce187a 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -23,7 +23,8 @@ __INIT ENTRY(_start) /* Mask all interrupts */ - csrw sie, zero + csrw CSR_SIE, zero + csrw CSR_SIP, zero /* Load the global pointer */ .option push @@ -68,14 +69,10 @@ clear_bss_done: /* Restore C environment */ la tp, init_task sw zero, TASK_TI_CPU(tp) - - la sp, init_thread_union - li a0, ASM_THREAD_SIZE - add sp, sp, a0 + la sp, init_thread_union + THREAD_SIZE /* Start the kernel */ - mv a0, s0 - mv a1, s1 + mv a0, s1 call parse_dtb tail start_kernel @@ -89,7 +86,7 @@ relocate: /* Point stvec to virtual address of intruction after satp write */ la a0, 1f add a0, a0, a1 - csrw stvec, a0 + csrw CSR_STVEC, a0 /* Compute satp for kernel page tables, but don't load it yet */ la a2, swapper_pg_dir @@ -99,18 +96,20 @@ relocate: /* * Load trampoline page directory, which will cause us to trap to - * stvec if VA != PA, or simply fall through if VA == PA + * stvec if VA != PA, or simply fall through if VA == PA. We need a + * full fence here because setup_vm() just wrote these PTEs and we need + * to ensure the new translations are in use. */ la a0, trampoline_pg_dir srl a0, a0, PAGE_SHIFT or a0, a0, a1 sfence.vma - csrw sptbr, a0 + csrw CSR_SATP, a0 .align 2 1: /* Set trap vector to spin forever to help debug */ la a0, .Lsecondary_park - csrw stvec, a0 + csrw CSR_STVEC, a0 /* Reload the global pointer */ .option push @@ -118,8 +117,14 @@ relocate: la gp, __global_pointer$ .option pop - /* Switch to kernel page tables */ - csrw sptbr, a2 + /* + * Switch to kernel page tables. A full fence is necessary in order to + * avoid using the trampoline translations, which are only correct for + * the first superpage. 
Fetching the fence is guarnteed to work + * because that first superpage is translated the same way. + */ + csrw CSR_SATP, a2 + sfence.vma ret @@ -130,7 +135,7 @@ relocate: /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park - csrw stvec, a3 + csrw CSR_STVEC, a3 slli a3, a0, LGREG la a1, __cpu_up_stack_pointer diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c index 48e6b7db83a1..6d8659388c49 100644 --- a/arch/riscv/kernel/irq.c +++ b/arch/riscv/kernel/irq.c @@ -14,17 +14,9 @@ /* * Possible interrupt causes: */ -#define INTERRUPT_CAUSE_SOFTWARE 1 -#define INTERRUPT_CAUSE_TIMER 5 -#define INTERRUPT_CAUSE_EXTERNAL 9 - -/* - * The high order bit of the trap cause register is always set for - * interrupts, which allows us to differentiate them from exceptions - * quickly. The INTERRUPT_CAUSE_* macros don't contain that bit, so we - * need to mask it off. - */ -#define INTERRUPT_CAUSE_FLAG (1UL << (__riscv_xlen - 1)) +#define INTERRUPT_CAUSE_SOFTWARE IRQ_S_SOFT +#define INTERRUPT_CAUSE_TIMER IRQ_S_TIMER +#define INTERRUPT_CAUSE_EXTERNAL IRQ_S_EXT int arch_show_interrupts(struct seq_file *p, int prec) { @@ -37,7 +29,7 @@ asmlinkage void __irq_entry do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); irq_enter(); - switch (regs->scause & ~INTERRUPT_CAUSE_FLAG) { + switch (regs->scause & ~SCAUSE_IRQ_FLAG) { case INTERRUPT_CAUSE_TIMER: riscv_timer_interrupt(); break; @@ -54,7 +46,8 @@ asmlinkage void __irq_entry do_IRQ(struct pt_regs *regs) handle_arch_irq(regs); break; default: - panic("unexpected interrupt cause"); + pr_alert("unexpected interrupt cause 0x%lx", regs->scause); + BUG(); } irq_exit(); diff --git a/arch/riscv/kernel/perf_event.c b/arch/riscv/kernel/perf_event.c index 667ee70defea..91626d9ae5f2 100644 --- a/arch/riscv/kernel/perf_event.c +++ b/arch/riscv/kernel/perf_event.c @@ -185,10 +185,10 @@ static inline u64 read_counter(int idx) switch (idx) { case RISCV_PMU_CYCLE: - val = csr_read(cycle); + val 
= csr_read(CSR_CYCLE); break; case RISCV_PMU_INSTRET: - val = csr_read(instret); + val = csr_read(CSR_INSTRET); break; default: WARN_ON_ONCE(idx < 0 || idx > RISCV_MAX_COUNTERS); diff --git a/arch/riscv/kernel/reset.c b/arch/riscv/kernel/reset.c index 2a53d26ffdd6..ed637aee514b 100644 --- a/arch/riscv/kernel/reset.c +++ b/arch/riscv/kernel/reset.c @@ -12,11 +12,15 @@ */ #include <linux/reboot.h> -#include <linux/export.h> #include <asm/sbi.h> -void (*pm_power_off)(void) = machine_power_off; -EXPORT_SYMBOL(pm_power_off); +static void default_power_off(void) +{ + sbi_shutdown(); + while (1); +} + +void (*pm_power_off)(void) = default_power_off; void machine_restart(char *cmd) { @@ -26,11 +30,10 @@ void machine_restart(char *cmd) void machine_halt(void) { - machine_power_off(); + pm_power_off(); } void machine_power_off(void) { - sbi_shutdown(); - while (1); + pm_power_off(); } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 540a331d1376..d93bcce004e3 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -52,9 +52,11 @@ struct screen_info screen_info = { atomic_t hart_lottery; unsigned long boot_cpu_hartid; -void __init parse_dtb(unsigned int hartid, void *dtb) +void __init parse_dtb(phys_addr_t dtb_phys) { - if (early_init_dt_scan(__va(dtb))) + void *dtb = __va(dtb_phys); + + if (early_init_dt_scan(dtb)) return; pr_err("No DTB passed to the kernel\n"); diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 837e1646091a..804d6ee4f3c5 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -234,6 +234,9 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* Are we from a system call? */ if (regs->scause == EXC_SYSCALL) { + /* Avoid additional syscall restarting via ret_from_exception */ + regs->scause = -1UL; + /* If so, check system call restarting.. 
*/ switch (regs->a0) { case -ERESTART_RESTARTBLOCK: @@ -272,6 +275,9 @@ static void do_signal(struct pt_regs *regs) /* Did we come from a system call? */ if (regs->scause == EXC_SYSCALL) { + /* Avoid additional syscall restarting via ret_from_exception */ + regs->scause = -1UL; + /* Restart the system call - no handlers present */ switch (regs->a0) { case -ERESTARTNOHAND: diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 0c41d07ec281..b2537ffa855c 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -42,7 +42,7 @@ unsigned long __cpuid_to_hartid_map[NR_CPUS] = { void __init smp_setup_processor_id(void) { - cpuid_to_hartid_map(0) = boot_cpu_hartid; + cpuid_to_hartid_map(0) = boot_cpu_hartid; } /* A collection of single bit ipi messages. */ @@ -53,7 +53,7 @@ static struct { int riscv_hartid_to_cpuid(int hartid) { - int i = -1; + int i; for (i = 0; i < NR_CPUS; i++) if (cpuid_to_hartid_map(i) == hartid) @@ -70,6 +70,12 @@ void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out) for_each_cpu(cpu, in) cpumask_set_cpu(cpuid_to_hartid_map(cpu), out); } + +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_hartid_map(cpu); +} + /* Unsupported */ int setup_profiling_timer(unsigned int multiplier) { @@ -89,7 +95,7 @@ void riscv_software_interrupt(void) unsigned long *stats = ipi_data[smp_processor_id()].stats; /* Clear pending IPI */ - csr_clear(sip, SIE_SSIE); + csr_clear(CSR_SIP, SIE_SSIE); while (true) { unsigned long ops; @@ -199,52 +205,3 @@ void smp_send_reschedule(int cpu) send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE); } -/* - * Performs an icache flush for the given MM context. RISC-V has no direct - * mechanism for instruction cache shoot downs, so instead we send an IPI that - * informs the remote harts they need to flush their local instruction caches. 
- * To avoid pathologically slow behavior in a common case (a bunch of - * single-hart processes on a many-hart machine, ie 'make -j') we avoid the - * IPIs for harts that are not currently executing a MM context and instead - * schedule a deferred local instruction cache flush to be performed before - * execution resumes on each hart. - */ -void flush_icache_mm(struct mm_struct *mm, bool local) -{ - unsigned int cpu; - cpumask_t others, hmask, *mask; - - preempt_disable(); - - /* Mark every hart's icache as needing a flush for this MM. */ - mask = &mm->context.icache_stale_mask; - cpumask_setall(mask); - /* Flush this hart's I$ now, and mark it as flushed. */ - cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, mask); - local_flush_icache_all(); - - /* - * Flush the I$ of other harts concurrently executing, and mark them as - * flushed. - */ - cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu)); - local |= cpumask_empty(&others); - if (mm != current->active_mm || !local) { - cpumask_clear(&hmask); - riscv_cpuid_to_hartid_mask(&others, &hmask); - sbi_remote_fence_i(hmask.bits); - } else { - /* - * It's assumed that at least one strongly ordered operation is - * performed on this hart between setting a hart's cpumask bit - * and scheduling this MM context on that hart. Sending an SBI - * remote message will do this, but in the case where no - * messages are sent we still need to order this hart's writes - * with flush_icache_deferred(). 
- */ - smp_mb(); - } - - preempt_enable(); -} diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index eb533b5c2c8c..7a0b62252524 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -47,6 +47,17 @@ void __init smp_prepare_boot_cpu(void) void __init smp_prepare_cpus(unsigned int max_cpus) { + int cpuid; + + /* This covers non-smp usecase mandated by "nosmp" option */ + if (max_cpus == 0) + return; + + for_each_possible_cpu(cpuid) { + if (cpuid == smp_processor_id()) + continue; + set_cpu_present(cpuid, true); + } } void __init setup_smp(void) @@ -73,12 +84,19 @@ void __init setup_smp(void) } cpuid_to_hartid_map(cpuid) = hart; - set_cpu_possible(cpuid, true); - set_cpu_present(cpuid, true); cpuid++; } BUG_ON(!found_boot_cpu); + + if (cpuid > nr_cpu_ids) + pr_warn("Total number of cpus [%d] is greater than nr_cpus option value [%d]\n", + cpuid, nr_cpu_ids); + + for (cpuid = 1; cpuid < nr_cpu_ids; cpuid++) { + if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID) + set_cpu_possible(cpuid, true); + } } int __cpu_up(unsigned int cpu, struct task_struct *tidle) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 4d403274c2e8..e80a5e8da119 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -33,9 +33,9 @@ static void notrace walk_stackframe(struct task_struct *task, unsigned long fp, sp, pc; if (regs) { - fp = GET_FP(regs); - sp = GET_USP(regs); - pc = GET_IP(regs); + fp = frame_pointer(regs); + sp = user_stack_pointer(regs); + pc = instruction_pointer(regs); } else if (task == NULL || task == current) { const register unsigned long current_sp __asm__ ("sp"); fp = (unsigned long)__builtin_frame_address(0); @@ -64,12 +64,8 @@ static void notrace walk_stackframe(struct task_struct *task, frame = (struct stackframe *)fp - 1; sp = fp; fp = frame->fp; -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR pc = ftrace_graph_ret_addr(current, NULL, frame->ra, (unsigned long *)(fp - 8)); 
-#else - pc = frame->ra - 0x4; -#endif } } @@ -82,8 +78,8 @@ static void notrace walk_stackframe(struct task_struct *task, unsigned long *ksp; if (regs) { - sp = GET_USP(regs); - pc = GET_IP(regs); + sp = user_stack_pointer(regs); + pc = instruction_pointer(regs); } else if (task == NULL || task == current) { const register unsigned long current_sp __asm__ ("sp"); sp = current_sp; diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 24a9333dda2c..3d1a651dc54c 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -70,7 +70,7 @@ void do_trap(struct pt_regs *regs, int signo, int code, && printk_ratelimit()) { pr_info("%s[%d]: unhandled signal %d code 0x%x at 0x" REG_FMT, tsk->comm, task_pid_nr(tsk), signo, code, addr); - print_vma_addr(KERN_CONT " in ", GET_IP(regs)); + print_vma_addr(KERN_CONT " in ", instruction_pointer(regs)); pr_cont("\n"); show_regs(regs); } @@ -118,6 +118,17 @@ DO_ERROR_INFO(do_trap_ecall_s, DO_ERROR_INFO(do_trap_ecall_m, SIGILL, ILL_ILLTRP, "environment call from M-mode"); +#ifdef CONFIG_GENERIC_BUG +static inline unsigned long get_break_insn_length(unsigned long pc) +{ + bug_insn_t insn; + + if (probe_kernel_address((bug_insn_t *)pc, insn)) + return 0; + return (((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? 
4UL : 2UL); +} +#endif /* CONFIG_GENERIC_BUG */ + asmlinkage void do_trap_break(struct pt_regs *regs) { #ifdef CONFIG_GENERIC_BUG @@ -129,8 +140,8 @@ asmlinkage void do_trap_break(struct pt_regs *regs) case BUG_TRAP_TYPE_NONE: break; case BUG_TRAP_TYPE_WARN: - regs->sepc += sizeof(bug_insn_t); - return; + regs->sepc += get_break_insn_length(regs->sepc); + break; case BUG_TRAP_TYPE_BUG: die(regs, "Kernel BUG"); } @@ -145,11 +156,14 @@ int is_valid_bugaddr(unsigned long pc) { bug_insn_t insn; - if (pc < PAGE_OFFSET) + if (pc < VMALLOC_START) return 0; if (probe_kernel_address((bug_insn_t *)pc, insn)) return 0; - return (insn == __BUG_INSN); + if ((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) + return (insn == __BUG_INSN_32); + else + return ((insn & __COMPRESSED_INSN_MASK) == __BUG_INSN_16); } #endif /* CONFIG_GENERIC_BUG */ @@ -159,9 +173,9 @@ void __init trap_init(void) * Set sup0 scratch register to 0, indicating to exception vector * that we are presently executing in the kernel */ - csr_write(sscratch, 0); + csr_write(CSR_SSCRATCH, 0); /* Set the exception vector address */ - csr_write(stvec, &handle_exception); + csr_write(CSR_STVEC, &handle_exception); /* Enable all interrupts */ - csr_write(sie, -1); + csr_write(CSR_SIE, -1); } diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index fec62b24df89..b07b765f312a 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -36,7 +36,7 @@ $(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE # these symbols in the kernel code rather than hand-coded addresses. 
SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) + -Wl,--hash-style=both $(obj)/vdso-dummy.o: $(src)/vdso.lds $(obj)/rt_sigreturn.o FORCE $(call if_changed,vdsold) diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index b68aac701803..8db569141485 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -9,3 +9,5 @@ obj-y += fault.o obj-y += extable.o obj-y += ioremap.o obj-y += cacheflush.o +obj-y += context.o +obj-y += sifive_l2_cache.o diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 498c0a0814fe..497b7d07af0c 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -14,6 +14,67 @@ #include <asm/pgtable.h> #include <asm/cacheflush.h> +#ifdef CONFIG_SMP + +#include <asm/sbi.h> + +void flush_icache_all(void) +{ + sbi_remote_fence_i(NULL); +} + +/* + * Performs an icache flush for the given MM context. RISC-V has no direct + * mechanism for instruction cache shoot downs, so instead we send an IPI that + * informs the remote harts they need to flush their local instruction caches. + * To avoid pathologically slow behavior in a common case (a bunch of + * single-hart processes on a many-hart machine, ie 'make -j') we avoid the + * IPIs for harts that are not currently executing a MM context and instead + * schedule a deferred local instruction cache flush to be performed before + * execution resumes on each hart. + */ +void flush_icache_mm(struct mm_struct *mm, bool local) +{ + unsigned int cpu; + cpumask_t others, hmask, *mask; + + preempt_disable(); + + /* Mark every hart's icache as needing a flush for this MM. */ + mask = &mm->context.icache_stale_mask; + cpumask_setall(mask); + /* Flush this hart's I$ now, and mark it as flushed. */ + cpu = smp_processor_id(); + cpumask_clear_cpu(cpu, mask); + local_flush_icache_all(); + + /* + * Flush the I$ of other harts concurrently executing, and mark them as + * flushed. 
+ */ + cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu)); + local |= cpumask_empty(&others); + if (mm != current->active_mm || !local) { + cpumask_clear(&hmask); + riscv_cpuid_to_hartid_mask(&others, &hmask); + sbi_remote_fence_i(hmask.bits); + } else { + /* + * It's assumed that at least one strongly ordered operation is + * performed on this hart between setting a hart's cpumask bit + * and scheduling this MM context on that hart. Sending an SBI + * remote message will do this, but in the case where no + * messages are sent we still need to order this hart's writes + * with flush_icache_deferred(). + */ + smp_mb(); + } + + preempt_enable(); +} + +#endif /* CONFIG_SMP */ + void flush_icache_pte(pte_t pte) { struct page *page = pte_page(pte); diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c new file mode 100644 index 000000000000..89ceb3cbe218 --- /dev/null +++ b/arch/riscv/mm/context.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + */ + +#include <linux/mm.h> +#include <asm/tlbflush.h> +#include <asm/cacheflush.h> + +/* + * When necessary, performs a deferred icache flush for the given MM context, + * on the local CPU. RISC-V has no direct mechanism for instruction cache + * shoot downs, so instead we send an IPI that informs the remote harts they + * need to flush their local instruction caches. To avoid pathologically slow + * behavior in a common case (a bunch of single-hart processes on a many-hart + * machine, ie 'make -j') we avoid the IPIs for harts that are not currently + * executing a MM context and instead schedule a deferred local instruction + * cache flush to be performed before execution resumes on each hart. This + * actually performs that local instruction cache flush, which implicitly only + * refers to the current hart. 
+ */ +static inline void flush_icache_deferred(struct mm_struct *mm) +{ +#ifdef CONFIG_SMP + unsigned int cpu = smp_processor_id(); + cpumask_t *mask = &mm->context.icache_stale_mask; + + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); + /* + * Ensure the remote hart's writes are visible to this hart. + * This pairs with a barrier in flush_icache_mm. + */ + smp_mb(); + local_flush_icache_all(); + } + +#endif +} + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *task) +{ + unsigned int cpu; + + if (unlikely(prev == next)) + return; + + /* + * Mark the current MM context as inactive, and the next as + * active. This is at least used by the icache flushing + * routines in order to determine who should be flushed. + */ + cpu = smp_processor_id(); + + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Use the old spbtr name instead of using the current satp + * name to support binutils 2.29 which doesn't know about the + * privileged ISA 1.10 yet. + */ + csr_write(sptbr, virt_to_pfn(next->pgd) | SATP_MODE); + local_flush_tlb_all(); + + flush_icache_deferred(next); +} diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 88401d5125bc..cec8be9e2d6a 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -229,8 +229,9 @@ vmalloc_fault: pte_t *pte_k; int index; + /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) - goto bad_area; + return do_trap(regs, SIGSEGV, code, addr, tsk); /* * Synchronize this task's top level page-table @@ -239,13 +240,9 @@ vmalloc_fault: * Do _not_ use "tsk->active_mm->pgd" here. * We might be inside an interrupt in the middle * of a task switch. - * - * Note: Use the old spbtr name instead of using the current - * satp name to support binutils 2.29 which doesn't know about - * the privileged ISA 1.10 yet. 
*/ index = pgd_index(addr); - pgd = (pgd_t *)pfn_to_virt(csr_read(sptbr)) + index; + pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) diff --git a/arch/riscv/mm/sifive_l2_cache.c b/arch/riscv/mm/sifive_l2_cache.c new file mode 100644 index 000000000000..4eb64619b3f4 --- /dev/null +++ b/arch/riscv/mm/sifive_l2_cache.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SiFive L2 cache controller Driver + * + * Copyright (C) 2018-2019 SiFive, Inc. + * + */ +#include <linux/debugfs.h> +#include <linux/interrupt.h> +#include <linux/of_irq.h> +#include <linux/of_address.h> +#include <asm/sifive_l2_cache.h> + +#define SIFIVE_L2_DIRECCFIX_LOW 0x100 +#define SIFIVE_L2_DIRECCFIX_HIGH 0x104 +#define SIFIVE_L2_DIRECCFIX_COUNT 0x108 + +#define SIFIVE_L2_DATECCFIX_LOW 0x140 +#define SIFIVE_L2_DATECCFIX_HIGH 0x144 +#define SIFIVE_L2_DATECCFIX_COUNT 0x148 + +#define SIFIVE_L2_DATECCFAIL_LOW 0x160 +#define SIFIVE_L2_DATECCFAIL_HIGH 0x164 +#define SIFIVE_L2_DATECCFAIL_COUNT 0x168 + +#define SIFIVE_L2_CONFIG 0x00 +#define SIFIVE_L2_WAYENABLE 0x08 +#define SIFIVE_L2_ECCINJECTERR 0x40 + +#define SIFIVE_L2_MAX_ECCINTR 3 + +static void __iomem *l2_base; +static int g_irq[SIFIVE_L2_MAX_ECCINTR]; + +enum { + DIR_CORR = 0, + DATA_CORR, + DATA_UNCORR, +}; + +#ifdef CONFIG_DEBUG_FS +static struct dentry *sifive_test; + +static ssize_t l2_write(struct file *file, const char __user *data, + size_t count, loff_t *ppos) +{ + unsigned int val; + + if (kstrtouint_from_user(data, count, 0, &val)) + return -EINVAL; + if ((val >= 0 && val < 0xFF) || (val >= 0x10000 && val < 0x100FF)) + writel(val, l2_base + SIFIVE_L2_ECCINJECTERR); + else + return -EINVAL; + return count; +} + +static const struct file_operations l2_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = l2_write +}; + +static void setup_sifive_debug(void) +{ + sifive_test = debugfs_create_dir("sifive_l2_cache", NULL); + + 
debugfs_create_file("sifive_debug_inject_error", 0200, + sifive_test, NULL, &l2_fops); +} +#endif + +static void l2_config_read(void) +{ + u32 regval, val; + + regval = readl(l2_base + SIFIVE_L2_CONFIG); + val = regval & 0xFF; + pr_info("L2CACHE: No. of Banks in the cache: %d\n", val); + val = (regval & 0xFF00) >> 8; + pr_info("L2CACHE: No. of ways per bank: %d\n", val); + val = (regval & 0xFF0000) >> 16; + pr_info("L2CACHE: Sets per bank: %llu\n", (uint64_t)1 << val); + val = (regval & 0xFF000000) >> 24; + pr_info("L2CACHE: Bytes per cache block: %llu\n", (uint64_t)1 << val); + + regval = readl(l2_base + SIFIVE_L2_WAYENABLE); + pr_info("L2CACHE: Index of the largest way enabled: %d\n", regval); +} + +static const struct of_device_id sifive_l2_ids[] = { + { .compatible = "sifive,fu540-c000-ccache" }, + { /* end of table */ }, +}; + +static ATOMIC_NOTIFIER_HEAD(l2_err_chain); + +int register_sifive_l2_error_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&l2_err_chain, nb); +} +EXPORT_SYMBOL_GPL(register_sifive_l2_error_notifier); + +int unregister_sifive_l2_error_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&l2_err_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_sifive_l2_error_notifier); + +static irqreturn_t l2_int_handler(int irq, void *device) +{ + unsigned int regval, add_h, add_l; + + if (irq == g_irq[DIR_CORR]) { + add_h = readl(l2_base + SIFIVE_L2_DIRECCFIX_HIGH); + add_l = readl(l2_base + SIFIVE_L2_DIRECCFIX_LOW); + pr_err("L2CACHE: DirError @ 0x%08X.%08X\n", add_h, add_l); + regval = readl(l2_base + SIFIVE_L2_DIRECCFIX_COUNT); + atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_CE, + "DirECCFix"); + } + if (irq == g_irq[DATA_CORR]) { + add_h = readl(l2_base + SIFIVE_L2_DATECCFIX_HIGH); + add_l = readl(l2_base + SIFIVE_L2_DATECCFIX_LOW); + pr_err("L2CACHE: DataError @ 0x%08X.%08X\n", add_h, add_l); + regval = readl(l2_base + SIFIVE_L2_DATECCFIX_COUNT); + 
atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_CE, + "DatECCFix"); + } + if (irq == g_irq[DATA_UNCORR]) { + add_h = readl(l2_base + SIFIVE_L2_DATECCFAIL_HIGH); + add_l = readl(l2_base + SIFIVE_L2_DATECCFAIL_LOW); + pr_err("L2CACHE: DataFail @ 0x%08X.%08X\n", add_h, add_l); + regval = readl(l2_base + SIFIVE_L2_DATECCFAIL_COUNT); + atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_UE, + "DatECCFail"); + } + + return IRQ_HANDLED; +} + +int __init sifive_l2_init(void) +{ + struct device_node *np; + struct resource res; + int i, rc; + + np = of_find_matching_node(NULL, sifive_l2_ids); + if (!np) + return -ENODEV; + + if (of_address_to_resource(np, 0, &res)) + return -ENODEV; + + l2_base = ioremap(res.start, resource_size(&res)); + if (!l2_base) + return -ENOMEM; + + for (i = 0; i < SIFIVE_L2_MAX_ECCINTR; i++) { + g_irq[i] = irq_of_parse_and_map(np, i); + rc = request_irq(g_irq[i], l2_int_handler, 0, "l2_ecc", NULL); + if (rc) { + pr_err("L2CACHE: Could not request IRQ %d\n", g_irq[i]); + return rc; + } + } + + l2_config_read(); + +#ifdef CONFIG_DEBUG_FS + setup_sifive_debug(); +#endif + return 0; +} +device_initcall(sifive_l2_init); diff --git a/arch/s390/Makefile b/arch/s390/Makefile index df1d6a150f30..de8521fc9de5 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -10,6 +10,8 @@ # Copyright (C) 1994 by Linus Torvalds # +KBUILD_DEFCONFIG := defconfig + LD_BFD := elf64-s390 KBUILD_LDFLAGS := -m elf64_s390 KBUILD_AFLAGS_MODULE += -fPIC diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index c51496bbac19..7cba96e7587b 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -58,7 +58,6 @@ define cmd_section_cmp touch $@ endef -OBJCOPYFLAGS_bzImage := --pad-to $$(readelf -s $(obj)/compressed/vmlinux | awk '/\<_end\>/ {print or(strtonum("0x"$$2),4095)+1}') $(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE $(call if_changed,objcopy) diff --git 
a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/compressed/vmlinux.lds.S index 112b8d9f1e4c..635217eb3d91 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/compressed/vmlinux.lds.S @@ -77,6 +77,8 @@ SECTIONS _compressed_start = .; *(.vmlinux.bin.compressed) _compressed_end = .; + FILL(0xff); + . = ALIGN(4096); } . = ALIGN(256); .bss : { diff --git a/arch/s390/defconfig b/arch/s390/configs/defconfig index c59b922cb6c5..c59b922cb6c5 100644 --- a/arch/s390/defconfig +++ b/arch/s390/configs/defconfig diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index f316de40e51b..27696755daa9 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -28,6 +28,7 @@ #define CPACF_KMCTR 0xb92d /* MSA4 */ #define CPACF_PRNO 0xb93c /* MSA5 */ #define CPACF_KMA 0xb929 /* MSA8 */ +#define CPACF_KDSA 0xb93a /* MSA9 */ /* * En/decryption modifier bits diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c47e22bba87f..bdbc81b5bc91 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -278,6 +278,7 @@ struct kvm_s390_sie_block { #define ECD_HOSTREGMGMT 0x20000000 #define ECD_MEF 0x08000000 #define ECD_ETOKENF 0x02000000 +#define ECD_ECC 0x00200000 __u32 ecd; /* 0x01c8 */ __u8 reserved1cc[18]; /* 0x01cc */ __u64 pp; /* 0x01de */ @@ -312,6 +313,7 @@ struct kvm_vcpu_stat { u64 halt_successful_poll; u64 halt_attempted_poll; u64 halt_poll_invalid; + u64 halt_no_poll_steal; u64 halt_wakeup; u64 instruction_lctl; u64 instruction_lctlg; diff --git a/arch/s390/include/asm/segment.h b/arch/s390/include/asm/segment.h deleted file mode 100644 index 97a0582b8d0f..000000000000 --- a/arch/s390/include/asm/segment.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SEGMENT_H -#define _ASM_SEGMENT_H - -#endif diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 16511d97e8dc..47104e5b47fd 
100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -152,7 +152,10 @@ struct kvm_s390_vm_cpu_subfunc { __u8 pcc[16]; /* with MSA4 */ __u8 ppno[16]; /* with MSA5 */ __u8 kma[16]; /* with MSA8 */ - __u8 reserved[1808]; + __u8 kdsa[16]; /* with MSA9 */ + __u8 sortl[32]; /* with STFLE.150 */ + __u8 dfltcc[32]; /* with STFLE.151 */ + __u8 reserved[1728]; }; /* kvm attributes for crypto */ diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index cd3df5514552..ad71132374f0 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -24,7 +24,6 @@ #include <linux/seccomp.h> #include <linux/compat.h> #include <trace/syscall.h> -#include <asm/segment.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 061418f787c3..e822b2964a83 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -430,3 +430,9 @@ 425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree sys_open_tree +429 common move_mount sys_move_mount sys_move_mount +430 common fsopen sys_fsopen sys_fsopen +431 common fsconfig sys_fsconfig sys_fsconfig +432 common fsmount sys_fsmount sys_fsmount +433 common fspick sys_fspick sys_fspick diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 1816ee48eadd..d3db3d7ed077 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -30,6 +30,7 @@ config KVM select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS + select HAVE_KVM_NO_POLL select SRCU select KVM_VFIO ---help--- diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1fd706f6206c..9dde4d7d8704 100644 --- a/arch/s390/kvm/interrupt.c +++ 
b/arch/s390/kvm/interrupt.c @@ -14,6 +14,7 @@ #include <linux/kvm_host.h> #include <linux/hrtimer.h> #include <linux/mmu_context.h> +#include <linux/nospec.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/bitmap.h> @@ -2307,6 +2308,7 @@ static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id) { if (id >= MAX_S390_IO_ADAPTERS) return NULL; + id = array_index_nospec(id, MAX_S390_IO_ADAPTERS); return kvm->arch.adapters[id]; } @@ -2320,8 +2322,13 @@ static int register_io_adapter(struct kvm_device *dev, (void __user *)attr->addr, sizeof(adapter_info))) return -EFAULT; - if ((adapter_info.id >= MAX_S390_IO_ADAPTERS) || - (dev->kvm->arch.adapters[adapter_info.id] != NULL)) + if (adapter_info.id >= MAX_S390_IO_ADAPTERS) + return -EINVAL; + + adapter_info.id = array_index_nospec(adapter_info.id, + MAX_S390_IO_ADAPTERS); + + if (dev->kvm->arch.adapters[adapter_info.id] != NULL) return -EINVAL; adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4638303ba6a8..8d6d75db8de6 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -75,6 +75,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, + { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, @@ -177,6 +178,11 @@ static int hpage; module_param(hpage, int, 0444); MODULE_PARM_DESC(hpage, "1m huge page backing support"); +/* maximum percentage of steal time for polling. 
>100 is treated like 100 */ +static u8 halt_poll_max_steal = 10; +module_param(halt_poll_max_steal, byte, 0644); +MODULE_PARM_DESC(hpage, "Maximum percentage of steal time to allow polling"); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond @@ -321,6 +327,22 @@ static inline int plo_test_bit(unsigned char nr) return cc == 0; } +static inline void __insn32_query(unsigned int opcode, u8 query[32]) +{ + register unsigned long r0 asm("0") = 0; /* query function */ + register unsigned long r1 asm("1") = (unsigned long) query; + + asm volatile( + /* Parameter regs are ignored */ + " .insn rrf,%[opc] << 16,2,4,6,0\n" + : "=m" (*query) + : "d" (r0), "a" (r1), [opc] "i" (opcode) + : "cc"); +} + +#define INSN_SORTL 0xb938 +#define INSN_DFLTCC 0xb939 + static void kvm_s390_cpu_feat_init(void) { int i; @@ -368,6 +390,16 @@ static void kvm_s390_cpu_feat_init(void) __cpacf_query(CPACF_KMA, (cpacf_mask_t *) kvm_s390_available_subfunc.kma); + if (test_facility(155)) /* MSA9 */ + __cpacf_query(CPACF_KDSA, (cpacf_mask_t *) + kvm_s390_available_subfunc.kdsa); + + if (test_facility(150)) /* SORTL */ + __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl); + + if (test_facility(151)) /* DFLTCC */ + __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc); + if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); /* @@ -513,9 +545,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; @@ -657,6 +686,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) set_kvm_facility(kvm->arch.model.fac_mask, 135); set_kvm_facility(kvm->arch.model.fac_list, 135); } + if (test_facility(148)) { + set_kvm_facility(kvm->arch.model.fac_mask, 148); + 
set_kvm_facility(kvm->arch.model.fac_list, 148); + } + if (test_facility(152)) { + set_kvm_facility(kvm->arch.model.fac_mask, 152); + set_kvm_facility(kvm->arch.model.fac_list, 152); + } r = 0; } else r = -EINVAL; @@ -1323,6 +1360,19 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx", ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + VM_EVENT(kvm, 3, "SET: guest KDSA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[1]); + VM_EVENT(kvm, 3, "SET: guest SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[3]); + VM_EVENT(kvm, 3, "SET: guest DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); return 0; } @@ -1491,6 +1541,19 @@ static int kvm_s390_get_processor_subfunc(struct kvm *kvm, VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx", ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + VM_EVENT(kvm, 3, "GET: guest KDSA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[1]); + VM_EVENT(kvm, 3, "GET: guest SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[2], + 
((unsigned long *) &kvm->arch.model.subfuncs.sortl)[3]); + VM_EVENT(kvm, 3, "GET: guest DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); return 0; } @@ -1546,6 +1609,19 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx", ((unsigned long *) &kvm_s390_available_subfunc.kma)[0], ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]); + VM_EVENT(kvm, 3, "GET: host KDSA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kdsa)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kdsa)[1]); + VM_EVENT(kvm, 3, "GET: host SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[0], + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[1], + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[2], + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[3]); + VM_EVENT(kvm, 3, "GET: host DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[1], + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[2], + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[3]); return 0; } @@ -2817,6 +2893,25 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.enabled_gmap = vcpu->arch.gmap; } +static bool kvm_has_pckmo_subfunc(struct kvm *kvm, unsigned long nr) +{ + if (test_bit_inv(nr, (unsigned long *)&kvm->arch.model.subfuncs.pckmo) && + test_bit_inv(nr, (unsigned long *)&kvm_s390_available_subfunc.pckmo)) + return true; + return false; +} + +static bool kvm_has_pckmo_ecc(struct kvm *kvm) +{ + /* At least one ECC subfunction must be present */ + return 
kvm_has_pckmo_subfunc(kvm, 32) || + kvm_has_pckmo_subfunc(kvm, 33) || + kvm_has_pckmo_subfunc(kvm, 34) || + kvm_has_pckmo_subfunc(kvm, 40) || + kvm_has_pckmo_subfunc(kvm, 41); + +} + static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) { /* @@ -2829,13 +2924,19 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); vcpu->arch.sie_block->eca &= ~ECA_APIE; + vcpu->arch.sie_block->ecd &= ~ECD_ECC; if (vcpu->kvm->arch.crypto.apie) vcpu->arch.sie_block->eca |= ECA_APIE; /* Set up protected key support */ - if (vcpu->kvm->arch.crypto.aes_kw) + if (vcpu->kvm->arch.crypto.aes_kw) { vcpu->arch.sie_block->ecb3 |= ECB3_AES; + /* ecc is also wrapped with AES key */ + if (kvm_has_pckmo_ecc(vcpu->kvm)) + vcpu->arch.sie_block->ecd |= ECD_ECC; + } + if (vcpu->kvm->arch.crypto.dea_kw) vcpu->arch.sie_block->ecb3 |= ECB3_DEA; } @@ -3068,6 +3169,17 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, } } +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +{ + /* do not poll with more than halt_poll_max_steal percent of steal time */ + if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >= + halt_poll_max_steal) { + vcpu->stat.halt_no_poll_steal++; + return true; + } + return false; +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { /* kvm common code refers to this, but never calls it */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d62fa148558b..076090f9e666 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -288,7 +288,9 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) const u32 crycb_addr = crycbd_o & 0x7ffffff8U; unsigned long *b1, *b2; u8 ecb3_flags; + u32 ecd_flags; int apie_h; + int apie_s; int key_msk = test_kvm_facility(vcpu->kvm, 76); int fmt_o = crycbd_o & CRYCB_FORMAT_MASK; int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK; @@ -297,7 +299,8 
@@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->crycbd = 0; apie_h = vcpu->arch.sie_block->eca & ECA_APIE; - if (!apie_h && (!key_msk || fmt_o == CRYCB_FORMAT0)) + apie_s = apie_h & scb_o->eca; + if (!apie_s && (!key_msk || (fmt_o == CRYCB_FORMAT0))) return 0; if (!crycb_addr) @@ -308,7 +311,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) ((crycb_addr + 128) & PAGE_MASK)) return set_validity_icpt(scb_s, 0x003CU); - if (apie_h && (scb_o->eca & ECA_APIE)) { + if (apie_s) { ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr, vcpu->kvm->arch.crypto.crycb, fmt_o, fmt_h); @@ -320,7 +323,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* we may only allow it if enabled for guest 2 */ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & (ECB3_AES | ECB3_DEA); - if (!ecb3_flags) + ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC; + if (!ecb3_flags && !ecd_flags) goto end; /* copy only the wrapping keys */ @@ -329,6 +333,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0035U); scb_s->ecb3 |= ecb3_flags; + scb_s->ecd |= ecd_flags; /* xor both blocks in one run */ b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; @@ -339,7 +344,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) end: switch (ret) { case -EINVAL: - return set_validity_icpt(scb_s, 0x0020U); + return set_validity_icpt(scb_s, 0x0022U); case -EFAULT: return set_validity_icpt(scb_s, 0x0035U); case -EACCES: diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 01892dcf4029..0c1f257be422 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -28,7 +28,7 @@ static void __init kasan_early_panic(const char *reason) { sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); sclp_early_printk(reason); - disabled_wait(0); + 
disabled_wait(); } static void * __init kasan_early_alloc_segment(void) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index fd788e0f2e5b..cead9e0dcffb 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -93,6 +93,9 @@ static struct facility_def facility_defs[] = { 131, /* enhanced-SOP 2 and side-effect */ 139, /* multiple epoch facility */ 146, /* msa extension 8 */ + 150, /* enhanced sort */ + 151, /* deflate conversion */ + 155, /* msa extension 9 */ -1 /* END */ } }, diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 480b057556ee..016a727d4357 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -430,3 +430,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index a1dd24307b00..e047480b1605 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -473,3 +473,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/unicore32/configs/unicore32_defconfig b/arch/unicore32/configs/unicore32_defconfig index aebd01fc28e5..360cc9abcdb0 100644 --- a/arch/unicore32/configs/unicore32_defconfig +++ b/arch/unicore32/configs/unicore32_defconfig @@ -119,7 +119,7 @@ CONFIG_I2C_PUV3=y # Hardware 
Monitoring support #CONFIG_SENSORS_LM75=m # Generic Thermal sysfs driver -#CONFIG_THERMAL=m +#CONFIG_THERMAL=y #CONFIG_THERMAL_HWMON=y # Multimedia support diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index c93dc6478cb2..5fe2426bb7a5 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -28,7 +28,6 @@ generic-y += parport.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h -generic-y += segment.h generic-y += serial.h generic-y += shmparam.h generic-y += syscalls.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 21e9f2fac04b..2bbbd4d1ba31 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -270,9 +270,6 @@ config GENERIC_BUG config GENERIC_BUG_RELATIVE_POINTERS bool -config GENERIC_HWEIGHT - def_bool y - config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4cd5f982b1e5..ad968b7bac72 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,12 +398,6 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq -387 i386 open_tree sys_open_tree __ia32_sys_open_tree -388 i386 move_mount sys_move_mount __ia32_sys_move_mount -389 i386 fsopen sys_fsopen __ia32_sys_fsopen -390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig -391 i386 fsmount sys_fsmount __ia32_sys_fsmount -392 i386 fspick sys_fspick __ia32_sys_fspick 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget @@ -438,3 +432,9 @@ 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter 427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register +428 
i386 open_tree sys_open_tree __ia32_sys_open_tree +429 i386 move_mount sys_move_mount __ia32_sys_move_mount +430 i386 fsopen sys_fsopen __ia32_sys_fsopen +431 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig +432 i386 fsmount sys_fsmount __ia32_sys_fsmount +433 i386 fspick sys_fspick __ia32_sys_fspick diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 64ca0d06259a..b4e6f9e6204a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,18 +343,18 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq -335 common open_tree __x64_sys_open_tree -336 common move_mount __x64_sys_move_mount -337 common fsopen __x64_sys_fsopen -338 common fsconfig __x64_sys_fsconfig -339 common fsmount __x64_sys_fsmount -340 common fspick __x64_sys_fspick # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal 425 common io_uring_setup __x64_sys_io_uring_setup 426 common io_uring_enter __x64_sys_io_uring_enter 427 common io_uring_register __x64_sys_io_uring_register +428 common open_tree __x64_sys_open_tree +429 common move_mount __x64_sys_move_mount +430 common fsopen __x64_sys_fsopen +431 common fsconfig __x64_sys_fsconfig +432 common fsmount __x64_sys_fsmount +433 common fspick __x64_sys_fspick # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 8e470b018512..3a4d8d4d39f8 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -73,14 +73,12 @@ const char *outfilename; enum { sym_vvar_start, sym_vvar_page, - sym_hpet_page, sym_pvclock_page, sym_hvclock_page, }; const int special_pages[] = { sym_vvar_page, - sym_hpet_page, sym_pvclock_page, sym_hvclock_page, }; @@ -93,7 +91,6 @@ struct vdso_sym { struct vdso_sym required_syms[] = { 
[sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, - [sym_hpet_page] = {"hpet_page", true}, [sym_pvclock_page] = {"pvclock_page", true}, [sym_hvclock_page] = {"hvclock_page", true}, {"VDSO32_NOTE_MASK", true}, diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 7635c23f7d82..58a6993d7eb3 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -393,7 +393,7 @@ static __init int _init_events_attrs(void) return 0; } -const struct attribute_group *amd_iommu_attr_groups[] = { +static const struct attribute_group *amd_iommu_attr_groups[] = { &amd_iommu_format_group, &amd_iommu_cpumask_group, &amd_iommu_events_group, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ef763f535e3a..546d13e436aa 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2384,7 +2384,11 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(55, (unsigned long *)&status)) { handled++; - intel_pt_interrupt(); + if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && + perf_guest_cbs->handle_intel_pt_intr)) + perf_guest_cbs->handle_intel_pt_intr(); + else + intel_pt_interrupt(); } /* @@ -3265,7 +3269,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { - if (!(event->attr.freq || event->attr.wakeup_events)) { + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event))) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 07fc84bb85c1..a6ac2f4f76fc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -394,10 +394,10 @@ struct cpu_hw_events { /* Event constraint, but match on all event flags too. 
*/ #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) #define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n) \ - EVENT_CONSTRAINT_RANGE(c, e, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) /* Check only flags, but allow all event/umask */ #define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index fc0693569f7a..ba88edd0d58b 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -12,8 +12,6 @@ #define REG_OUT "a" #endif -#define __HAVE_ARCH_SW_HWEIGHT - static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 62be73b23d5c..e8f58ddd06d9 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -10,6 +10,7 @@ extern struct e820_table *e820_table_firmware; extern unsigned long pci_mem_start; +extern bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type); extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type); extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c79abe7ca093..450d69a1e6fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -470,6 +470,7 @@ struct kvm_pmu { u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; + u64 global_ovf_ctrl_mask; u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; @@ -781,6 +782,9 @@ struct kvm_vcpu_arch { /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ bool l1tf_flush_l1d; + + /* AMD MSRC001_0015 Hardware Configuration */ + u64 msr_hwcr; }; 
struct kvm_lpage_info { @@ -1168,7 +1172,8 @@ struct kvm_x86_ops { uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); - int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); + int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired); void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 88dd202c8b00..979ef971cc78 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -789,6 +789,14 @@ #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 +/* PERF_GLOBAL_OVF_CTL bits */ +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT) +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT 62 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT) +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT 63 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT) + /* Geode defined MSRs */ #define MSR_GEODE_BUSCONT_CONF0 0x00001900 diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 27566e57e87d..230474e2ddb5 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -19,7 +19,6 @@ struct vdso_image { long sym_vvar_start; /* Negative offset to the vvar area */ long sym_vvar_page; - long sym_hpet_page; long sym_pvclock_page; long sym_hvclock_page; long sym_VDSO32_NOTE_MASK; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2879e234e193..76dd605ee2a3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -73,12 +73,13 @@ EXPORT_SYMBOL(pci_mem_start); * This function checks if any part of the range <start,end> is mapped * with type. 
*/ -bool e820__mapped_any(u64 start, u64 end, enum e820_type type) +static bool _e820__mapped_any(struct e820_table *table, + u64 start, u64 end, enum e820_type type) { int i; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; if (type && entry->type != type) continue; @@ -88,6 +89,17 @@ bool e820__mapped_any(u64 start, u64 end, enum e820_type type) } return 0; } + +bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type) +{ + return _e820__mapped_any(e820_table_firmware, start, end, type); +} +EXPORT_SYMBOL_GPL(e820__mapped_raw_any); + +bool e820__mapped_any(u64 start, u64 end, enum e820_type type) +{ + return _e820__mapped_any(e820_table, start, end, type); +} EXPORT_SYMBOL_GPL(e820__mapped_any); /* diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index cf52ee0d8711..9e4fa2484d10 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -768,7 +768,7 @@ static struct kprobe kretprobe_kprobe = { /* * Called from kretprobe_trampoline */ -static __used void *trampoline_handler(struct pt_regs *regs) +__used __visible void *trampoline_handler(struct pt_regs *regs) { struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7de466eb960b..8b6d03e55d2f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,7 +58,6 @@ #include <asm/alternative.h> #include <asm/fpu/xstate.h> #include <asm/trace/mpx.h> -#include <asm/nospec-branch.h> #include <asm/mpx.h> #include <asm/vm86.h> #include <asm/umip.h> @@ -368,13 +367,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; - /* - * This situation can be triggered by userspace via - * modify_ldt(2) and the return does 
not take the regular - * user space exit, so a CPU buffer clear is required when - * MDS mitigation is enabled. - */ - mds_user_clear_cpu_buffers(); return; } #endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbbe611f0c49..80a642a0143d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -963,13 +963,13 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) return 1; - eax = kvm_register_read(vcpu, VCPU_REGS_RAX); - ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + eax = kvm_rax_read(vcpu); + ecx = kvm_rcx_read(vcpu); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true); - kvm_register_write(vcpu, VCPU_REGS_RAX, eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, edx); + kvm_rax_write(vcpu, eax); + kvm_rbx_write(vcpu, ebx); + kvm_rcx_write(vcpu, ecx); + kvm_rdx_write(vcpu, edx); return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index cc24b3a32c44..8ca4b39918e0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1535,10 +1535,10 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) longmode = is_64_bit_mode(vcpu); if (longmode) - kvm_register_write(vcpu, VCPU_REGS_RAX, result); + kvm_rax_write(vcpu, result); else { - kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); + kvm_rdx_write(vcpu, result >> 32); + kvm_rax_write(vcpu, result & 0xffffffff); } } @@ -1611,18 +1611,18 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) longmode = is_64_bit_mode(vcpu); if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - 
outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + param = ((u64)kvm_rdx_read(vcpu) << 32) | + (kvm_rax_read(vcpu) & 0xffffffff); + ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | + (kvm_rcx_read(vcpu) & 0xffffffff); + outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | + (kvm_rsi_read(vcpu) & 0xffffffff); } #ifdef CONFIG_X86_64 else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + param = kvm_rcx_read(vcpu); + ingpa = kvm_rdx_read(vcpu); + outgpa = kvm_r8_read(vcpu); } #endif diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f8f56a93358b..1cc6c47dc77e 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -9,6 +9,34 @@ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) +#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ +static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ +{ \ + return vcpu->arch.regs[VCPU_REGS_##uname]; \ +} \ +static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \ + unsigned long val) \ +{ \ + vcpu->arch.regs[VCPU_REGS_##uname] = val; \ +} +BUILD_KVM_GPR_ACCESSORS(rax, RAX) +BUILD_KVM_GPR_ACCESSORS(rbx, RBX) +BUILD_KVM_GPR_ACCESSORS(rcx, RCX) +BUILD_KVM_GPR_ACCESSORS(rdx, RDX) +BUILD_KVM_GPR_ACCESSORS(rbp, RBP) +BUILD_KVM_GPR_ACCESSORS(rsi, RSI) +BUILD_KVM_GPR_ACCESSORS(rdi, RDI) +#ifdef CONFIG_X86_64 +BUILD_KVM_GPR_ACCESSORS(r8, R8) +BUILD_KVM_GPR_ACCESSORS(r9, R9) +BUILD_KVM_GPR_ACCESSORS(r10, R10) +BUILD_KVM_GPR_ACCESSORS(r11, R11) +BUILD_KVM_GPR_ACCESSORS(r12, R12) +BUILD_KVM_GPR_ACCESSORS(r13, R13) +BUILD_KVM_GPR_ACCESSORS(r14, R14) +BUILD_KVM_GPR_ACCESSORS(r15, R15) +#endif + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -37,6 +65,16 @@ static inline void kvm_rip_write(struct 
kvm_vcpu *vcpu, unsigned long val) kvm_register_write(vcpu, VCPU_REGS_RIP, val); } +static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RSP); +} + +static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RSP, val); +} + static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { might_sleep(); /* on svm */ @@ -83,8 +121,8 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) { - return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) - | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); + return (kvm_rax_read(vcpu) & -1u) + | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32); } static inline void enter_guest_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bd13fdddbdc4..4924f83ed4f3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1454,7 +1454,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) if (swait_active(q)) swake_up_one(q); - if (apic_lvtt_tscdeadline(apic)) + if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; } @@ -1696,37 +1696,42 @@ static void cancel_hv_timer(struct kvm_lapic *apic) static bool start_hv_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; - int r; + struct kvm_vcpu *vcpu = apic->vcpu; + bool expired; WARN_ON(preemptible()); if (!kvm_x86_ops->set_hv_timer) return false; - if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) - return false; - if (!ktimer->tscdeadline) return false; - r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); - if (r < 0) + if (kvm_x86_ops->set_hv_timer(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; hrtimer_cancel(&ktimer->timer); /* - * Also recheck ktimer->pending, in case the sw timer triggered in - * the window. 
For periodic timer, leave the hv timer running for - * simplicity, and the deadline will be recomputed on the next vmexit. + * To simplify handling the periodic timer, leave the hv timer running + * even if the deadline timer has expired, i.e. rely on the resulting + * VM-Exit to recompute the periodic timer's target expiration. */ - if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { - if (r) + if (!apic_lvtt_period(apic)) { + /* + * Cancel the hv timer if the sw timer fired while the hv timer + * was being programmed, or if the hv timer itself expired. + */ + if (atomic_read(&ktimer->pending)) { + cancel_hv_timer(apic); + } else if (expired) { apic_timer_expired(apic); - return false; + cancel_hv_timer(apic); + } } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use); + return true; } @@ -1750,8 +1755,13 @@ static void start_sw_timer(struct kvm_lapic *apic) static void restart_apic_timer(struct kvm_lapic *apic) { preempt_disable(); + + if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending)) + goto out; + if (!start_hv_timer(apic)) start_sw_timer(apic); +out: preempt_enable(); } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7b45d231f..1e9ba81accba 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -44,6 +44,7 @@ #include <asm/page.h> #include <asm/pat.h> #include <asm/cmpxchg.h> +#include <asm/e820/api.h> #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> @@ -487,16 +488,24 @@ static void kvm_mmu_reset_all_pte_masks(void) * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. 
if system RAM overlaps + * the most significant bits of legal physical address space. */ - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_data.x86_phys_bits < + shadow_nonpresent_or_rsvd_mask = 0; + low_phys_bits = boot_cpu_data.x86_cache_bits; + if (boot_cpu_data.x86_cache_bits < 52 - shadow_nonpresent_or_rsvd_mask_len) { shadow_nonpresent_or_rsvd_mask = - rsvd_bits(boot_cpu_data.x86_phys_bits - + rsvd_bits(boot_cpu_data.x86_cache_bits - shadow_nonpresent_or_rsvd_mask_len, - boot_cpu_data.x86_phys_bits - 1); + boot_cpu_data.x86_cache_bits - 1); low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; - } + } else + WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF)); + shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); } @@ -2892,7 +2901,9 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) */ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - return true; + return !e820__mapped_raw_any(pfn_to_hpa(pfn), + pfn_to_hpa(pfn + 1) - 1, + E820_TYPE_RAM); } /* Bits which may be returned by set_spte() */ diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index e9ea2d45ae66..9f72cc427158 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -48,11 +48,6 @@ static bool msr_mtrr_valid(unsigned msr) return false; } -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - static bool valid_mtrr_type(unsigned t) { return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ @@ -67,10 +62,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return false; if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; + return kvm_pat_valid(data); } else if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 08715034e315..367a47df4ba0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -141,15 +141,35 @@ 
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct page *page; npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); - /* Check if the user is doing something meaningless. */ - if (unlikely(npages != 1)) - return -EFAULT; - - table = kmap_atomic(page); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); - - kvm_release_page_dirty(page); + if (likely(npages == 1)) { + table = kmap_atomic(page); + ret = CMPXCHG(&table[index], orig_pte, new_pte); + kunmap_atomic(table); + + kvm_release_page_dirty(page); + } else { + struct vm_area_struct *vma; + unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK; + unsigned long pfn; + unsigned long paddr; + + down_read(&current->mm->mmap_sem); + vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); + if (!vma || !(vma->vm_flags & VM_PFNMAP)) { + up_read(&current->mm->mmap_sem); + return -EFAULT; + } + pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + paddr = pfn << PAGE_SHIFT; + table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); + if (!table) { + up_read(&current->mm->mmap_sem); + return -EFAULT; + } + ret = CMPXCHG(&table[index], orig_pte, new_pte); + memunmap(table); + up_read(&current->mm->mmap_sem); + } return (ret != orig_pte); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6b92eaf4a3b1..a849dcb7fbc5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2091,7 +2091,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); - kvm_register_write(vcpu, VCPU_REGS_RDX, eax); + kvm_rdx_write(vcpu, eax); if (kvm_vcpu_apicv_active(vcpu) && !init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); @@ -3071,32 +3071,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm) return false; } -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) -{ - struct page *page; - - might_sleep(); - - page =
kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) - goto error; - - *_page = page; - - return kmap(page); - -error: - kvm_inject_gp(&svm->vcpu, 0); - - return NULL; -} - -static void nested_svm_unmap(struct page *page) -{ - kunmap(page); - kvm_release_page_dirty(page); -} - static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { unsigned port, size, iopm_len; @@ -3299,10 +3273,11 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr static int nested_svm_vmexit(struct vcpu_svm *svm) { + int rc; struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + struct kvm_host_map map; trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, vmcb->control.exit_info_1, @@ -3311,9 +3286,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_int_info_err, KVM_ISA_SVM); - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); - if (!nested_vmcb) + rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map); + if (rc) { + if (rc == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return 1; + } + + nested_vmcb = map.hva; /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); @@ -3408,16 +3388,16 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) } else { (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); - kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); - kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); + kvm_rax_write(&svm->vcpu, hsave->save.rax); + kvm_rsp_write(&svm->vcpu, hsave->save.rsp); + kvm_rip_write(&svm->vcpu, hsave->save.rip); svm->vmcb->save.dr7 = 0; svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; mark_all_dirty(svm->vmcb); - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); @@ -3483,7 +3463,7 @@ static bool 
nested_vmcb_checks(struct vmcb *vmcb) } static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct page *page) + struct vmcb *nested_vmcb, struct kvm_host_map *map) { if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; @@ -3516,9 +3496,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, kvm_mmu_reset_context(&svm->vcpu); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); - kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); - kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); + kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); + kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ svm->vmcb->save.rax = nested_vmcb->save.rax; @@ -3567,7 +3547,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->control.pause_filter_thresh = nested_vmcb->control.pause_filter_thresh; - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, map, true); /* Enter Guest-Mode */ enter_guest_mode(&svm->vcpu); @@ -3587,17 +3567,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, static bool nested_svm_vmrun(struct vcpu_svm *svm) { + int rc; struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + struct kvm_host_map map; u64 vmcb_gpa; vmcb_gpa = svm->vmcb->save.rax; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) + rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map); + if (rc) { + if (rc == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return false; + } + + nested_vmcb = map.hva; if (!nested_vmcb_checks(nested_vmcb)) { nested_vmcb->control.exit_code = SVM_EXIT_ERR; @@ -3605,7 +3591,7 @@ static bool 
nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb->control.exit_info_1 = 0; nested_vmcb->control.exit_info_2 = 0; - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); return false; } @@ -3649,7 +3635,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(hsave, vmcb); - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page); + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); return true; } @@ -3673,21 +3659,26 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + struct kvm_host_map map; int ret; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); + if (ret) { + if (ret == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return 1; + } + + nested_vmcb = map.hva; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); return ret; } @@ -3695,21 +3686,26 @@ static int vmload_interception(struct vcpu_svm *svm) static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + struct kvm_host_map map; int ret; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); + if (ret) { + if (ret == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return 1; + } + + nested_vmcb = map.hva; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); return ret; } @@ -3791,11 +3787,11 @@ 
static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), - kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); + trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu), + kvm_rax_read(&svm->vcpu)); /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ - kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); + kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3803,7 +3799,7 @@ static int invlpga_interception(struct vcpu_svm *svm) static int skinit_interception(struct vcpu_svm *svm) { - trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); + trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu)); kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -3817,7 +3813,7 @@ static int wbinvd_interception(struct vcpu_svm *svm) static int xsetbv_interception(struct vcpu_svm *svm) { u64 new_bv = kvm_read_edx_eax(&svm->vcpu); - u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + u32 index = kvm_rcx_read(&svm->vcpu); if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; @@ -4213,7 +4209,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int rdmsr_interception(struct vcpu_svm *svm) { - u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + u32 ecx = kvm_rcx_read(&svm->vcpu); struct msr_data msr_info; msr_info.index = ecx; @@ -4225,10 +4221,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) } else { trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, - msr_info.data & 0xffffffff); - kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, - msr_info.data >> 32); + kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff); + kvm_rdx_write(&svm->vcpu, msr_info.data >> 32); 
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; return kvm_skip_emulated_instruction(&svm->vcpu); } @@ -4422,7 +4416,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) static int wrmsr_interception(struct vcpu_svm *svm) { struct msr_data msr; - u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + u32 ecx = kvm_rcx_read(&svm->vcpu); u64 data = kvm_read_edx_eax(&svm->vcpu); msr.data = data; @@ -6236,7 +6230,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *nested_vmcb; - struct page *page; + struct kvm_host_map map; u64 guest; u64 vmcb; @@ -6244,10 +6238,10 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); if (guest) { - nested_vmcb = nested_svm_map(svm, vmcb, &page); - if (!nested_vmcb) + if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) return 1; - enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); + nested_vmcb = map.hva; + enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map); } return 0; } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 854e144131c6..d6664ee3d127 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -2,6 +2,8 @@ #ifndef __KVM_X86_VMX_CAPS_H #define __KVM_X86_VMX_CAPS_H +#include <asm/vmx.h> + #include "lapic.h" extern bool __read_mostly enable_vpid; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0c601d079cd2..f1a69117ac0f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) if (!vmx->nested.hv_evmcs) return; - kunmap(vmx->nested.hv_evmcs_page); - kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); vmx->nested.hv_evmcs_vmptr = -1ull; - vmx->nested.hv_evmcs_page = NULL; vmx->nested.hv_evmcs = NULL; } @@ -229,16 
+227,9 @@ static void free_nested(struct kvm_vcpu *vcpu) kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -519,39 +510,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { int msr; - struct page *page; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - /* - * pred_cmd & spec_ctrl are trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); - bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. 
*/ if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; - if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !pred_cmd && !spec_ctrl) - return false; - - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; - msr_bitmap_l1 = (unsigned long *)kmap(page); + msr_bitmap_l1 = (unsigned long *)map->hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -592,20 +563,42 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, } } - if (spec_ctrl) + /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); + + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); + + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + + /* + * Checking the L0->L1 bitmap is trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. 
+ */ + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_SPEC_CTRL, MSR_TYPE_R | MSR_TYPE_W); - if (pred_cmd) + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); return true; } @@ -613,20 +606,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_host_map map; struct vmcs12 *shadow; - struct page *page; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == -1ull) return; shadow = get_shadow_vmcs12(vcpu); - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - memcpy(shadow, kmap(page), VMCS12_SIZE); + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + return; - kunmap(page); - kvm_release_page_clean(page); + memcpy(shadow, map.hva, VMCS12_SIZE); + kvm_vcpu_unmap(vcpu, &map, false); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, @@ -930,7 +923,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { if (!nested_cr3_valid(vcpu, cr3)) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* @@ -941,7 +934,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne !nested_ept) { if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { *entry_failure_code = ENTRY_FAIL_PDPTE; - return 1; + return -EINVAL; } } } @@ -1794,13 +1787,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, nested_release_evmcs(vcpu); - vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( - vcpu, assist_page.current_nested_vmcs); - - if 
(unlikely(is_error_page(vmx->nested.hv_evmcs_page))) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs), + &vmx->nested.hv_evmcs_map)) return 0; - vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; /* * Currently, KVM only supports eVMCS version 1 @@ -2373,19 +2364,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, */ if (vmx->emulation_required) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) - return 1; + return -EINVAL; if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + kvm_rsp_write(vcpu, vmcs12->guest_rsp); + kvm_rip_write(vcpu, vmcs12->guest_rip); return 0; } @@ -2589,11 +2580,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, return 0; } -/* - * Checks related to Host Control Registers and MSRs - */ -static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12) || + nested_check_vm_entry_controls(vcpu, vmcs12)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { bool ia32e; @@ -2606,6 +2605,10 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->host_ia32_pat)) + return -EINVAL; + /* * If the load IA32_EFER VM-exit control is 
1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, @@ -2624,41 +2627,12 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, return 0; } -/* - * Checks related to Guest Non-register State - */ -static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) -{ - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_check_vm_execution_controls(vcpu, vmcs12) || - nested_check_vm_exit_controls(vcpu, vmcs12) || - nested_check_vm_entry_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_check_host_control_regs(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - - if (nested_check_guest_non_reg_state(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - return 0; -} - static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int r; - struct page *page; + int r = 0; struct vmcs12 *shadow; + struct kvm_host_map map; if (vmcs12->vmcs_link_pointer == -1ull) return 0; @@ -2666,23 +2640,34 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) return -EINVAL; - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) return -EINVAL; - r = 0; - shadow = kmap(page); + shadow = map.hva; + if (shadow->hdr.revision_id != VMCS12_REVISION || shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) r = -EINVAL; - kunmap(page); - kvm_release_page_clean(page); + + kvm_vcpu_unmap(vcpu, &map, false); return r; } -static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 *exit_qual) +/* + * Checks related to Guest Non-register 
State + */ +static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) +{ + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + u32 *exit_qual) { bool ia32e; @@ -2690,11 +2675,15 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) - return 1; + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->guest_ia32_pat)) + return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; - return 1; + return -EINVAL; } /* @@ -2713,13 +2702,16 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) - return 1; + return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) - return 1; + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return -EINVAL; + + if (nested_check_guest_non_reg_state(vmcs12)) + return -EINVAL; return 0; } @@ -2832,6 +2824,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_host_map *map; struct page *page; u64 hpa; @@ -2864,20 +2857,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - 
vmx->nested.virtual_apic_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); + map = &vmx->nested.virtual_apic_map; /* * If translation failed, VM entry will fail because * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. */ - if (!is_error_page(page)) { - vmx->nested.virtual_apic_page = page; - hpa = page_to_phys(vmx->nested.virtual_apic_page); - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { @@ -2898,26 +2885,15 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } if (nested_cpu_has_posted_intr(vmcs12)) { - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull); + map = &vmx->nested.pi_desc_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { + vmx->nested.pi_desc = + (struct pi_desc *)(((void *)map->hva) + + offset_in_page(vmcs12->posted_intr_desc_addr)); + vmcs_write64(POSTED_INTR_DESC_ADDR, + pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); - if (is_error_page(page)) - return; - vmx->nested.pi_desc_page = page; - vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); - vmx->nested.pi_desc = - (struct pi_desc *)((void *)vmx->nested.pi_desc + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); } if 
(nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, @@ -3000,7 +2976,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return -1; } - if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) goto vmentry_fail_vmexit; } @@ -3145,9 +3121,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); - if (ret) - return nested_vmx_failValid(vcpu, ret); + if (nested_vmx_check_controls(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + if (nested_vmx_check_host_state(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with @@ -3310,11 +3288,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { - vapic_page = kmap(vmx->nested.virtual_apic_page); + vapic_page = vmx->nested.virtual_apic_map.hva; + if (!vapic_page) + return; + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); - kunmap(vmx->nested.virtual_apic_page); - status = vmcs_read16(GUEST_INTR_STATUS); if ((u8)max_irr > ((u8)status & 0xff)) { status &= ~0xff; @@ -3425,8 +3404,8 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); + vmcs12->guest_rsp = kvm_rsp_read(vcpu); + vmcs12->guest_rip = kvm_rip_read(vcpu); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); @@ 
-3609,8 +3588,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); vmx_set_efer(vcpu, vcpu->arch.efer); - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + kvm_rsp_write(vcpu, vmcs12->host_rsp); + kvm_rip_write(vcpu, vmcs12->host_rip); vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); vmx_set_interrupt_shadow(vcpu, 0); @@ -3955,16 +3934,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; /* * We are now running in L2, mmu_notifier will force to reload the @@ -4260,7 +4232,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; - struct page *page; + uint32_t revision; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -4306,20 +4278,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32 */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failInvalid(vcpu); - - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failInvalid(vcpu); - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - kvm_release_page_clean(page); + if 
(kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || + revision != VMCS12_REVISION) return nested_vmx_failInvalid(vcpu); - } - kunmap(page); - kvm_release_page_clean(page); vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); @@ -4377,7 +4341,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); @@ -4385,7 +4349,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - if (vmx->nested.hv_evmcs_page) { + if (vmx->nested.hv_evmcs_map.hva) { if (vmptr == vmx->nested.hv_evmcs_vmptr) nested_release_evmcs(vcpu); } else { @@ -4584,7 +4548,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); @@ -4597,11 +4561,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; if (vmx->nested.current_vmptr != vmptr) { + struct kvm_host_map map; struct vmcs12 *new_vmcs12; - struct page *page; - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the @@ -4611,12 +4574,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } - new_vmcs12 = kmap(page); + + new_vmcs12 = map.hva; + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &map, false); return nested_vmx_failValid(vcpu, 
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -4628,8 +4592,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * cached. */ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &map, false); set_current_vmptr(vmx, vmptr); } @@ -4804,7 +4767,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 index = kvm_rcx_read(vcpu); u64 address; bool accessed_dirty; struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -4850,7 +4813,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; - u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + u32 function = kvm_rax_read(vcpu); /* * VMFUNC is only supported for nested guests, but we always enable the @@ -4936,7 +4899,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 exit_reason) { - u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 msr_index = kvm_rcx_read(vcpu); gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) @@ -5373,9 +5336,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != 0) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) - nested_enable_evmcs(vcpu, NULL); - if (!nested_vmx_allowed(vcpu)) return kvm_state->vmx.vmxon_pa == -1ull ? 
0 : -EINVAL; @@ -5417,6 +5377,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->vmx.vmxon_pa == -1ull) return 0; + if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) + nested_enable_evmcs(vcpu, NULL); + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; ret = enter_vmx_operation(vcpu); if (ret) @@ -5460,9 +5423,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return 0; - vmx->nested.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); - if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); @@ -5480,14 +5440,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; } - if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || - nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_controls(vcpu, vmcs12) || + nested_vmx_check_host_state(vcpu, vmcs12) || + nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) return -EINVAL; vmx->nested.dirty_vmcs12 = true; + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + ret = nested_vmx_enter_non_root_mode(vcpu, false); - if (ret) + if (ret) { + vmx->nested.nested_run_pending = 0; return -EINVAL; + } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5ab4a364348e..f8502c376b37 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -227,7 +227,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; @@ -297,6 +297,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | 
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); + if (kvm_x86_ops->pt_supported()) + pmu->global_ovf_ctrl_mask &= + ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e1fa935a545f..1ac167614032 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1692,6 +1692,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_POWER_CTL: + msr_info->data = vmx->msr_ia32_power_ctl; + break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1822,6 +1825,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_POWER_CTL: + vmx->msr_ia32_power_ctl = data; + break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1891,7 +1897,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + if (!kvm_pat_valid(data)) return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; @@ -2288,7 +2294,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_CLEAR_BNDCFGS | @@ -3619,14 +3624,13 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 
!nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) return false; rvi = vmx_get_rvi(); - vapic_page = kmap(vmx->nested.virtual_apic_page); + vapic_page = vmx->nested.virtual_apic_map.hva; vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - kunmap(vmx->nested.virtual_apic_page); return ((rvi & 0xf0) > (vppr & 0xf0)); } @@ -4827,7 +4831,7 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 ecx = kvm_rcx_read(vcpu); struct msr_data msr_info; msr_info.index = ecx; @@ -4840,18 +4844,16 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_read(ecx, msr_info.data); - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; + kvm_rax_write(vcpu, msr_info.data & -1u); + kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u); return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) { struct msr_data msr; - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u32 ecx = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); msr.data = data; msr.index = ecx; @@ -4922,7 +4924,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) static int handle_xsetbv(struct kvm_vcpu *vcpu) { u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + u32 index = kvm_rcx_read(vcpu); if (kvm_set_xcr(vcpu, index, new_bv) == 0) return kvm_skip_emulated_instruction(vcpu); @@ -5723,8 +5725,16 @@ void dump_vmcs(void) if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) pr_err("TSC Multiplier = 0x%016llx\n", vmcs_read64(TSC_MULTIPLIER)); - if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", 
vmcs_read32(TPR_THRESHOLD)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); + } + pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); + pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) @@ -6856,30 +6866,6 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } -static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *entry; - union cpuid10_eax eax; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) - return false; - - eax.full = entry->eax; - return (eax.split.version_id > 0); -} - -static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool pmu_enabled = guest_cpuid_has_pmu(vcpu); - - if (pmu_enabled) - vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING; - else - vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING; -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6968,7 +6954,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); - nested_vmx_procbased_ctls_update(vcpu); } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && @@ -7028,7 +7013,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, return 0; } -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +static int vmx_set_hv_timer(struct 
kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; @@ -7051,10 +7037,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && - u64_shl_div_u64(delta_tsc, + delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, - &delta_tsc)) + vcpu->arch.tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* @@ -7067,7 +7052,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - return delta_tsc == 0; + *expired = !delta_tsc; + return 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) @@ -7104,9 +7090,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t gpa; - struct page *page = NULL; - u64 *pml_address; + gpa_t gpa, dst; if (is_guest_mode(vcpu)) { WARN_ON_ONCE(vmx->nested.pml_full); @@ -7126,15 +7110,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); - if (is_error_page(page)) + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) return 0; - pml_address = kmap(page); - pml_address[vmcs12->guest_pml_index--] = gpa; - kunmap(page); - kvm_release_page_clean(page); + vmcs12->guest_pml_index--; } return 0; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f879529906b4..63d37ccce3dc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -142,8 +142,11 @@ struct nested_vmx { * pointers, so we must keep them pinned while L2 runs. 
*/ struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; + struct kvm_host_map virtual_apic_map; + struct kvm_host_map pi_desc_map; + + struct kvm_host_map msr_bitmap_map; + struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; @@ -169,7 +172,7 @@ struct nested_vmx { } smm; gpa_t hv_evmcs_vmptr; - struct page *hv_evmcs_page; + struct kvm_host_map hv_evmcs_map; struct hv_enlightened_vmcs *hv_evmcs; }; @@ -257,6 +260,8 @@ struct vcpu_vmx { unsigned long host_debugctlmsr; + u64 msr_ia32_power_ctl; + /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b9591abde62a..536b78c4af6e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1100,15 +1100,15 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); bool kvm_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + u32 ecx = kvm_rcx_read(vcpu); u64 data; int err; err = kvm_pmu_rdpmc(vcpu, ecx, &data); if (err) return err; - kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); - kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); + kvm_rax_write(vcpu, (u32)data); + kvm_rdx_write(vcpu, data >> 32); return err; } EXPORT_SYMBOL_GPL(kvm_rdpmc); @@ -1174,6 +1174,9 @@ static u32 emulated_msrs[] = { MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_IA32_POWER_CTL, + + MSR_K7_HWCR, }; static unsigned num_emulated_msrs; @@ -1262,31 +1265,49 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return 0; } -bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) - return false; - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) - return false; + return false; if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - return false; + return 
false; + + if (efer & (EFER_LME | EFER_LMA) && + !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return false; + + if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) + return false; return true; + +} +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & efer_reserved_bits) + return false; + + return __kvm_valid_efer(vcpu, efer); } EXPORT_SYMBOL_GPL(kvm_valid_efer); -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; + u64 efer = msr_info->data; - if (!kvm_valid_efer(vcpu, efer)) - return 1; + if (efer & efer_reserved_bits) + return false; - if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; + if (!msr_info->host_initiated) { + if (!__kvm_valid_efer(vcpu, efer)) + return 1; + + if (is_paging(vcpu) && + (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + } efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; @@ -2279,6 +2300,18 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* + * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. + */ +static bool can_set_mci_status(struct kvm_vcpu *vcpu) +{ + /* McStatusWrEn enabled? 
*/ + if (guest_cpuid_is_amd(vcpu)) + return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); + + return false; +} + static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2310,9 +2343,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; + + /* MCi_STATUS */ if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) - return -1; + (offset & 0x3) == 1 && data != 0) { + if (!can_set_mci_status(vcpu)) + return -1; + } + vcpu->arch.mce_banks[offset] = data; break; } @@ -2456,13 +2494,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.arch_capabilities = data; break; case MSR_EFER: - return set_efer(vcpu, data); + return set_efer(vcpu, msr_info); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ - data &= ~(u64)0x40000; /* ignore Mc status write enable */ - if (data != 0) { + + /* Handle McStatusWrEn */ + if (data == BIT_ULL(18)) { + vcpu->arch.msr_hwcr = data; + } else if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); return 1; @@ -2736,7 +2777,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K8_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: - case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: @@ -2900,6 +2940,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_MISC_FEATURES_ENABLES: msr_info->data = vcpu->arch.msr_misc_features_enables; break; + case MSR_K7_HWCR: + msr_info->data = vcpu->arch.msr_hwcr; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -3079,9 +3122,6 @@ int kvm_vm_ioctl_check_extension(struct 
kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; @@ -5521,9 +5561,9 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned int bytes, struct x86_exception *exception) { + struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; - struct page *page; char *kaddr; bool exchanged; @@ -5540,12 +5580,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) goto emul_write; - kaddr = kmap_atomic(page); - kaddr += offset_in_page(gpa); + kaddr = map.hva + offset_in_page(gpa); + switch (bytes) { case 1: exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); @@ -5562,13 +5601,12 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, default: BUG(); } - kunmap_atomic(kaddr); - kvm_release_page_dirty(page); + + kvm_vcpu_unmap(vcpu, &map, true); if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -6558,7 +6596,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); + unsigned long val = kvm_rax_read(vcpu); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); if (ret) @@ -6593,8 +6631,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) } /* For size less than 4 we merge, else we zero extend */ - val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) - : 0; + val = (vcpu->arch.pio.size < 4) ? 
kvm_rax_read(vcpu) : 0; /* * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform @@ -6602,7 +6639,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) */ emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); - kvm_register_write(vcpu, VCPU_REGS_RAX, val); + kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); } @@ -6614,12 +6651,12 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, int ret; /* For size less than 4 we merge, else we zero extend */ - val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + val = (size < 4) ? kvm_rax_read(vcpu) : 0; ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); if (ret) { - kvm_register_write(vcpu, VCPU_REGS_RAX, val); + kvm_rax_write(vcpu, val); return ret; } @@ -6854,10 +6891,20 @@ static unsigned long kvm_get_guest_ip(void) return ip; } +static void kvm_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); +} + static struct perf_guest_info_callbacks kvm_guest_cbs = { .is_in_guest = kvm_is_in_guest, .is_user_mode = kvm_is_user_mode, .get_guest_ip = kvm_get_guest_ip, + .handle_intel_pt_intr = kvm_handle_intel_pt_intr, }; static void kvm_set_mmio_spte_mask(void) @@ -7133,11 +7180,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + nr = kvm_rax_read(vcpu); + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); trace_kvm_hypercall(nr, a0, a1, a2, a3); @@ 
-7178,7 +7225,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) out: if (!op_64_bit) ret = (u32)ret; - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + kvm_rax_write(vcpu, ret); ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); @@ -8280,23 +8327,23 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); - regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); - regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); - regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); - regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); - regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); + regs->rax = kvm_rax_read(vcpu); + regs->rbx = kvm_rbx_read(vcpu); + regs->rcx = kvm_rcx_read(vcpu); + regs->rdx = kvm_rdx_read(vcpu); + regs->rsi = kvm_rsi_read(vcpu); + regs->rdi = kvm_rdi_read(vcpu); + regs->rsp = kvm_rsp_read(vcpu); + regs->rbp = kvm_rbp_read(vcpu); #ifdef CONFIG_X86_64 - regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); - regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); - regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); - regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); - regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); - regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); - regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); - regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); + regs->r8 = kvm_r8_read(vcpu); + regs->r9 = kvm_r9_read(vcpu); + regs->r10 = kvm_r10_read(vcpu); + regs->r11 = kvm_r11_read(vcpu); + regs->r12 = kvm_r12_read(vcpu); + regs->r13 = kvm_r13_read(vcpu); + regs->r14 = kvm_r14_read(vcpu); + regs->r15 = kvm_r15_read(vcpu); #endif regs->rip = kvm_rip_read(vcpu); @@ -8316,23 +8363,23 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct 
kvm_regs *regs) vcpu->arch.emulate_regs_need_sync_from_vcpu = true; vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); - kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); - kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); - kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); - kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); - kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); - kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); - kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); + kvm_rax_write(vcpu, regs->rax); + kvm_rbx_write(vcpu, regs->rbx); + kvm_rcx_write(vcpu, regs->rcx); + kvm_rdx_write(vcpu, regs->rdx); + kvm_rsi_write(vcpu, regs->rsi); + kvm_rdi_write(vcpu, regs->rdi); + kvm_rsp_write(vcpu, regs->rsp); + kvm_rbp_write(vcpu, regs->rbp); #ifdef CONFIG_X86_64 - kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); - kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); - kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); - kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); - kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); - kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); - kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); - kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); + kvm_r8_write(vcpu, regs->r8); + kvm_r9_write(vcpu, regs->r9); + kvm_r10_write(vcpu, regs->r10); + kvm_r11_write(vcpu, regs->r11); + kvm_r12_write(vcpu, regs->r12); + kvm_r13_write(vcpu, regs->r13); + kvm_r14_write(vcpu, regs->r14); + kvm_r15_write(vcpu, regs->r15); #endif kvm_rip_write(vcpu, regs->rip); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 534d3f28bb01..a470ff0868c5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -345,6 +345,16 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) __this_cpu_write(current_vcpu, NULL); } + +static inline bool kvm_pat_valid(u64 data) +{ + if (data & 0xF8F8F8F8F8F8F8F8ull) + return false; + /* 0, 1, 4, 5, 6, 7 are valid values. 
*/ + return (data | ((data & 0x0202020202020202ull) << 1)) == data; +} + void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 20d14254b686..62fc457f3849 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,6 +58,37 @@ #include "ident_map.c" +#define DEFINE_POPULATE(fname, type1, type2, init) \ +static inline void fname##_init(struct mm_struct *mm, \ + type1##_t *arg1, type2##_t *arg2, bool init) \ +{ \ + if (init) \ + fname##_safe(mm, arg1, arg2); \ + else \ + fname(mm, arg1, arg2); \ +} + +DEFINE_POPULATE(p4d_populate, p4d, pud, init) +DEFINE_POPULATE(pgd_populate, pgd, p4d, init) +DEFINE_POPULATE(pud_populate, pud, pmd, init) +DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init) + +#define DEFINE_ENTRY(type1, type2, init) \ +static inline void set_##type1##_init(type1##_t *arg1, \ + type2##_t arg2, bool init) \ +{ \ + if (init) \ + set_##type1##_safe(arg1, arg2); \ + else \ + set_##type1(arg1, arg2); \ +} + +DEFINE_ENTRY(p4d, p4d, init) +DEFINE_ENTRY(pud, pud, init) +DEFINE_ENTRY(pmd, pmd, init) +DEFINE_ENTRY(pte, pte, init) + + /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move @@ -414,7 +445,7 @@ void __init cleanup_highmap(void) */ static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, - pgprot_t prot) + pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -432,7 +463,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pte_safe(pte, __pte(0)); + set_pte_init(pte, __pte(0), init); continue; } @@ -452,7 +483,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pr_info(" pte=%p 
addr=%lx pte=%016lx\n", pte, paddr, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } @@ -468,7 +499,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, */ static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, - unsigned long page_size_mask, pgprot_t prot) + unsigned long page_size_mask, pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -487,7 +518,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pmd_safe(pmd, __pmd(0)); + set_pmd_init(pmd, __pmd(0), init); continue; } @@ -496,7 +527,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); paddr_last = phys_pte_init(pte, paddr, - paddr_end, prot); + paddr_end, prot, + init); spin_unlock(&init_mm.page_table_lock); continue; } @@ -524,19 +556,20 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte_safe((pte_t *)pmd, - pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + set_pte_init((pte_t *)pmd, + pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE)), + init); spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; } pte = alloc_low_page(); - paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel_safe(&init_mm, pmd, pte); + 
pmd_populate_kernel_init(&init_mm, pmd, pte, init); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -551,7 +584,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, */ static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, - unsigned long page_size_mask) + unsigned long page_size_mask, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -573,7 +606,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pud_safe(pud, __pud(0)); + set_pud_init(pud, __pud(0), init); continue; } @@ -583,7 +616,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pmd_init(pmd, paddr, paddr_end, page_size_mask, - prot); + prot, init); continue; } /* @@ -610,9 +643,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte_safe((pte_t *)pud, - pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE)); + set_pte_init((pte_t *)pud, + pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE), + init); spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; @@ -620,10 +654,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, pmd = alloc_low_page(); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, - page_size_mask, prot); + page_size_mask, prot, init); spin_lock(&init_mm.page_table_lock); - pud_populate_safe(&init_mm, pud, pmd); + pud_populate_init(&init_mm, pud, pmd, init); spin_unlock(&init_mm.page_table_lock); } @@ -634,14 +668,15 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, static unsigned long __meminit phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end, - unsigned long page_size_mask) + unsigned long page_size_mask, bool init) { unsigned long paddr_next, paddr_last = paddr_end; unsigned long vaddr = (unsigned long)__va(paddr); int i = p4d_index(vaddr); if (!pgtable_l5_enabled()) - return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, + page_size_mask, init); for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { p4d_t *p4d; @@ -657,39 +692,34 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_p4d_safe(p4d, __p4d(0)); + set_p4d_init(p4d, __p4d(0), init); continue; } if (!p4d_none(*p4d)) { pud = pud_offset(p4d, 0); - paddr_last = phys_pud_init(pud, paddr, - paddr_end, - page_size_mask); + paddr_last = phys_pud_init(pud, paddr, paddr_end, + page_size_mask, init); continue; } pud = alloc_low_page(); paddr_last = phys_pud_init(pud, paddr, paddr_end, - page_size_mask); + page_size_mask, init); spin_lock(&init_mm.page_table_lock); - p4d_populate_safe(&init_mm, p4d, pud); + p4d_populate_init(&init_mm, p4d, pud, init); spin_unlock(&init_mm.page_table_lock); } return paddr_last; } -/* - * Create page table mapping for the physical memory for specific physical - * addresses. The virtual and physical addresses have to be aligned on PMD level - * down. It returns the last physical address mapped. 
- */ -unsigned long __meminit -kernel_physical_mapping_init(unsigned long paddr_start, - unsigned long paddr_end, - unsigned long page_size_mask) +static unsigned long __meminit +__kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask, + bool init) { bool pgd_changed = false; unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; @@ -709,19 +739,22 @@ kernel_physical_mapping_init(unsigned long paddr_start, p4d = (p4d_t *)pgd_page_vaddr(*pgd); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), - page_size_mask); + page_size_mask, + init); continue; } p4d = alloc_low_page(); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), - page_size_mask); + page_size_mask, init); spin_lock(&init_mm.page_table_lock); if (pgtable_l5_enabled()) - pgd_populate_safe(&init_mm, pgd, p4d); + pgd_populate_init(&init_mm, pgd, p4d, init); else - p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); + p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr), + (pud_t *) p4d, init); + spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -732,6 +765,37 @@ kernel_physical_mapping_init(unsigned long paddr_start, return paddr_last; } + +/* + * Create page table mapping for the physical memory for specific physical + * addresses. Note that it can only be used to populate non-present entries. + * The virtual and physical addresses have to be aligned on PMD level + * down. It returns the last physical address mapped. + */ +unsigned long __meminit +kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, true); +} + +/* + * This function is similar to kernel_physical_mapping_init() above with the + * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe() + * when updating the mapping. 
The caller is responsible to flush the TLBs after + * the function returns. + */ +unsigned long __meminit +kernel_physical_mapping_change(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, false); +} + #ifndef CONFIG_NUMA void __init initmem_init(void) { diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 385afa2b9e17..51f50a7a07ef 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -301,9 +301,13 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr, else split_page_size_mask = 1 << PG_LEVEL_2M; - kernel_physical_mapping_init(__pa(vaddr & pmask), - __pa((vaddr_end & pmask) + psize), - split_page_size_mask); + /* + * kernel_physical_mapping_change() does not flush the TLBs, so + * a TLB flush is required after we exit from the for loop. + */ + kernel_physical_mapping_change(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); } ret = 0; diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 319bde386d5f..eeae142062ed 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -13,6 +13,9 @@ void early_ioremap_page_table_range_init(void); unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); +unsigned long kernel_physical_mapping_change(unsigned long start, + unsigned long end, + unsigned long page_size_mask); void zone_sizes_init(void); extern int after_bootmem; diff --git a/arch/xtensa/include/asm/segment.h b/arch/xtensa/include/asm/segment.h deleted file mode 100644 index 98964ad15ca2..000000000000 --- a/arch/xtensa/include/asm/segment.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * include/asm-xtensa/segment.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. 
See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_SEGMENT_H -#define _XTENSA_SEGMENT_H - -#include <linux/uaccess.h> - -#endif /* _XTENSA_SEGEMENT_H */ diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 30084eaf8422..5fa0ee1c8e00 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -398,3 +398,9 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick diff --git a/arch/xtensa/platforms/xtfpga/setup.c b/arch/xtensa/platforms/xtfpga/setup.c index 820e8738af11..b1506376d502 100644 --- a/arch/xtensa/platforms/xtfpga/setup.c +++ b/arch/xtensa/platforms/xtfpga/setup.c @@ -18,6 +18,7 @@ #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/io.h> #include <linux/errno.h> #include <linux/reboot.h> #include <linux/kdev_t.h> |