diff options
Diffstat (limited to 'arch')
896 files changed, 16924 insertions, 15390 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 8519d9f42e33..56b6ccc0e32d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -24,6 +24,9 @@ config KEXEC_ELF config HAVE_IMA_KEXEC bool +config SET_FS + bool + config HOTPLUG_SMT bool @@ -420,6 +423,13 @@ config MMU_GATHER_NO_GATHER bool depends on MMU_GATHER_TABLE_FREE +config ARCH_WANT_IRQS_OFF_ACTIVATE_MM + bool + help + Temporary select until all architectures can be converted to have + irqs disabled over activate_mm. Architectures that do IPI based TLB + shootdowns should enable this. + config ARCH_HAVE_NMI_SAFE_CMPXCHG bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9c5f06e8eb9b..d6e9fc7a7b19 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -39,6 +39,7 @@ config ALPHA select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 select MMU_GATHER_NO_RANGE + select SET_FS help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 81037907268d..d84b19aa8e9d 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -11,7 +11,7 @@ #include <linux/export.h> #include <linux/scatterlist.h> #include <linux/log2.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/iommu-helper.h> #include <asm/io.h> @@ -141,12 +141,7 @@ iommu_arena_find_pages(struct device *dev, struct pci_iommu_arena *arena, unsigned long boundary_size; base = arena->dma_base >> PAGE_SHIFT; - if (dev) { - boundary_size = dma_get_seg_boundary(dev) + 1; - boundary_size >>= PAGE_SHIFT; - } else { - boundary_size = 1UL << (32 - PAGE_SHIFT); - } + boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT); /* Search forward for the first mask-aligned sequence of N free ptes */ ptes = arena->ptes; @@ -957,5 +952,7 @@ const struct dma_map_ops alpha_pci_ops = { .dma_supported = alpha_pci_supported, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(alpha_pci_ops); diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 15bc9d1e79f4..3739efce1ec0 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -531,7 +531,6 @@ do_work_pending(struct pt_regs *regs, unsigned long thread_flags, do_signal(regs, r0, r19); r0 = 0; } else { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } } diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ec8bed9e7b75..ee7b01bb7346 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -479,3 +479,4 @@ 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 +550 common process_madvise sys_process_madvise diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ba00c4e1e1c2..0a89cc9def65 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -48,6 +48,7 @@ config ARC select PCI_SYSCALL if PCI select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 + select SET_FS config ARCH_HAS_CACHE_LINE_SIZE def_bool y @@ -96,8 +97,6 @@ menu "ARC Platform/SoC/Board" source "arch/arc/plat-tb10x/Kconfig" source "arch/arc/plat-axs10x/Kconfig" -#New platform adds here -source "arch/arc/plat-eznps/Kconfig" source "arch/arc/plat-hsdk/Kconfig" endmenu diff --git a/arch/arc/Makefile b/arch/arc/Makefile index d00f8b8afd08..0c6bf0d1df7a 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -94,13 +94,8 @@ core-y += arch/arc/boot/dts/ core-y += arch/arc/plat-sim/ core-$(CONFIG_ARC_PLAT_TB10X) += arch/arc/plat-tb10x/ core-$(CONFIG_ARC_PLAT_AXS10X) += arch/arc/plat-axs10x/ -core-$(CONFIG_ARC_PLAT_EZNPS) += arch/arc/plat-eznps/ core-$(CONFIG_ARC_SOC_HSDK) += arch/arc/plat-hsdk/ -ifdef CONFIG_ARC_PLAT_EZNPS -KBUILD_CPPFLAGS += -I$(srctree)/arch/arc/plat-eznps/include -endif - drivers-$(CONFIG_OPROFILE) += arch/arc/oprofile/ libs-y += arch/arc/lib/ $(LIBGCC) diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi index 79ec27c043c1..2a151607b080 100644 --- a/arch/arc/boot/dts/axc001.dtsi +++ b/arch/arc/boot/dts/axc001.dtsi @@ -91,7 +91,7 @@ * avoid duplicating the MB dtsi file given that IRQ from * this intc to cpu intc are different for axs101 and axs103 */ - mb_intc: dw-apb-ictl@e0012000 { + mb_intc: interrupt-controller@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0x0 0xe0012000 0x0 0x200 >; diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index ac8e1b463a70..cd1edcf4f95e 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -129,7 +129,7 @@ * avoid duplicating the MB dtsi file given that IRQ from * this intc to cpu intc are different for axs101 and axs103 */ - mb_intc: dw-apb-ictl@e0012000 { + mb_intc: interrupt-controller@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0x0 0xe0012000 0x0 0x200 >; diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index 9da21e7fd246..70779386ca79 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -135,7 +135,7 @@ * avoid duplicating the MB dtsi file given that IRQ from * this intc to cpu intc are different for axs101 and axs103 */ - mb_intc: dw-apb-ictl@e0012000 { + mb_intc: interrupt-controller@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0x0 0xe0012000 0x0 0x200 >; diff --git a/arch/arc/boot/dts/eznps.dts b/arch/arc/boot/dts/eznps.dts deleted file mode 100644 index a7e2e8d8ff06..000000000000 --- a/arch/arc/boot/dts/eznps.dts +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -/dts-v1/; - -/ { - compatible = "ezchip,arc-nps"; - #address-cells = <1>; - #size-cells = <1>; - interrupt-parent = <&intc>; - present-cpus = "0-1,16-17"; - possible-cpus = "0-4095"; - - aliases { - ethernet0 = &gmac0; - }; - - chosen { - bootargs = "earlycon=uart8250,mmio32be,0xf7209000,115200n8 console=ttyS0,115200n8"; - }; - - memory { - device_type = "memory"; - reg = <0x80000000 0x20000000>; /* 512M */ - }; - - clocks { - sysclk: sysclk { - compatible = "fixed-clock"; - #clock-cells = <0>; - clock-frequency = <83333333>; - }; - }; - - soc { - compatible = "simple-bus"; - #address-cells = <1>; - #size-cells = <1>; - - /* child and parent address space 1:1 mapped */ - ranges; - - intc: interrupt-controller { - compatible = "ezchip,nps400-ic"; - interrupt-controller; - #interrupt-cells = <1>; - }; - - timer0: timer_clkevt { - compatible = "snps,arc-timer"; - interrupts = <3>; - clocks = <&sysclk>; - }; - - timer1: timer_clksrc { - compatible = "ezchip,nps400-timer"; - clocks = <&sysclk>; - clock-names="sysclk"; - }; - - uart@f7209000 { - compatible = "snps,dw-apb-uart"; - device_type = "serial"; - reg = <0xf7209000 0x100>; - interrupts = <6>; - clocks = <&sysclk>; - clock-names="baudclk"; - baud = <115200>; - reg-shift = <2>; - reg-io-width = <4>; - native-endian; - }; - - gmac0: ethernet@f7470000 { - compatible = "ezchip,nps-mgt-enet"; - reg = <0xf7470000 0x1940>; - interrupts = <7>; - /* Filled in by U-Boot */ - mac-address = [ 00 C0 00 F0 04 03 ]; - }; - }; -}; diff --git a/arch/arc/boot/dts/vdk_axc003.dtsi b/arch/arc/boot/dts/vdk_axc003.dtsi index f8be7ba8dad4..c21d0eb07bf6 100644 --- a/arch/arc/boot/dts/vdk_axc003.dtsi +++ b/arch/arc/boot/dts/vdk_axc003.dtsi @@ -46,7 +46,7 @@ }; - mb_intc: dw-apb-ictl@e0012000 { + mb_intc: interrupt-controller@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0xe0012000 0x200 >; diff --git a/arch/arc/boot/dts/vdk_axc003_idu.dtsi b/arch/arc/boot/dts/vdk_axc003_idu.dtsi index 0afa3e53a4e3..4d348853ac7c 100644 --- a/arch/arc/boot/dts/vdk_axc003_idu.dtsi +++ b/arch/arc/boot/dts/vdk_axc003_idu.dtsi @@ -54,7 +54,7 @@ }; - mb_intc: dw-apb-ictl@e0012000 { + mb_intc: interrupt-controller@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0xe0012000 0x200 >; diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig deleted file mode 100644 index f7a978dfdf1d..000000000000 --- a/arch/arc/configs/nps_defconfig +++ /dev/null @@ -1,80 +0,0 @@ -# CONFIG_LOCALVERSION_AUTO is not set -# CONFIG_SWAP is not set -CONFIG_SYSVIPC=y -CONFIG_NO_HZ_IDLE=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y -# CONFIG_EPOLL is not set -# CONFIG_SIGNALFD is not set -# CONFIG_TIMERFD is not set -# CONFIG_EVENTFD is not set -# CONFIG_AIO is not set -CONFIG_EMBEDDED=y -CONFIG_PERF_EVENTS=y -# CONFIG_COMPAT_BRK is not set -CONFIG_ISA_ARCOMPACT=y -CONFIG_KPROBES=y -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_ARC_PLAT_EZNPS=y -CONFIG_SMP=y -CONFIG_NR_CPUS=4096 -CONFIG_ARC_CACHE_LINE_SHIFT=5 -# CONFIG_ARC_CACHE_PAGES is not set -# CONFIG_ARC_HAS_LLSC is not set -CONFIG_ARC_KVADDR_SIZE=402 -CONFIG_ARC_EMUL_UNALIGNED=y -CONFIG_PREEMPT=y -CONFIG_NET=y -CONFIG_UNIX=y -CONFIG_INET=y -CONFIG_IP_PNP=y -# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -# CONFIG_INET_XFRM_MODE_TUNNEL is not set -# CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_DIAG is not set -# CONFIG_IPV6 is not set -# CONFIG_WIRELESS is not set -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -# CONFIG_PREVENT_FIRMWARE_BUILD is not set -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_COUNT=1 -CONFIG_BLK_DEV_RAM_SIZE=2048 -CONFIG_NETDEVICES=y -CONFIG_NETCONSOLE=y -# CONFIG_NET_VENDOR_BROADCOM is not set -# CONFIG_NET_VENDOR_MICREL is not set -# CONFIG_NET_VENDOR_STMICRO is not set -# CONFIG_WLAN is not set -# CONFIG_INPUT_MOUSEDEV is not set -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -# CONFIG_LEGACY_PTYS is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_NR_UARTS=1 -CONFIG_SERIAL_8250_RUNTIME_UARTS=1 -CONFIG_SERIAL_8250_DW=y -CONFIG_SERIAL_OF_PLATFORM=y -# CONFIG_HW_RANDOM is not set -# CONFIG_HWMON is not set -# CONFIG_USB_SUPPORT is not set -# CONFIG_DNOTIFY is not set -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -# CONFIG_MISC_FILESYSTEMS is not set -CONFIG_NFS_FS=y -CONFIG_NFS_V3_ACL=y -CONFIG_ROOT_NFS=y -CONFIG_DEBUG_INFO=y -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_MEMORY_INIT=y -CONFIG_ENABLE_DEFAULT_TRACERS=y diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index c614857eb209..5afc79c9b2f5 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -14,8 +14,6 @@ #include <asm/barrier.h> #include <asm/smp.h> -#ifndef CONFIG_ARC_PLAT_EZNPS - #define atomic_read(v) READ_ONCE((v)->counter) #ifdef CONFIG_ARC_HAS_LLSC @@ -45,7 +43,7 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \ \ /* \ * Explicit full memory barrier needed before/after as \ - * LLOCK/SCOND thmeselves don't provide any such semantics \ + * LLOCK/SCOND themselves don't provide any such semantics \ */ \ smp_mb(); \ \ @@ -71,7 +69,7 @@ static inline int atomic_fetch_##op(int i, atomic_t *v) \ \ /* \ * Explicit full memory barrier needed before/after as \ - * LLOCK/SCOND thmeselves don't provide any such semantics \ + * LLOCK/SCOND themselves don't provide any such semantics \ */ \ smp_mb(); \ \ @@ -195,108 +193,6 @@ ATOMIC_OPS(andnot, &= ~, bic) ATOMIC_OPS(or, |=, or) ATOMIC_OPS(xor, ^=, xor) -#else /* CONFIG_ARC_PLAT_EZNPS */ - -static inline int atomic_read(const atomic_t *v) -{ - int temp; - - __asm__ __volatile__( - " ld.di %0, [%1]" - : "=r"(temp) - : "r"(&v->counter) - : "memory"); - return temp; -} - -static inline void atomic_set(atomic_t *v, int i) -{ - __asm__ __volatile__( - " st.di %0,[%1]" - : - : "r"(i), "r"(&v->counter) - : "memory"); -} - -#define ATOMIC_OP(op, c_op, asm_op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - __asm__ __volatile__( \ - " mov r2, %0\n" \ - " mov r3, %1\n" \ - " .word %2\n" \ - : \ - : "r"(i), "r"(&v->counter), "i"(asm_op) \ - : "r2", "r3", "memory"); \ -} \ - -#define ATOMIC_OP_RETURN(op, c_op, asm_op) \ -static inline int atomic_##op##_return(int i, atomic_t *v) \ -{ \ - unsigned int temp = i; \ - \ - /* Explicit full memory barrier needed before/after */ \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - " mov r2, %0\n" \ - " mov r3, %1\n" \ - " .word %2\n" \ - " mov %0, r2" \ - : "+r"(temp) \ - : "r"(&v->counter), "i"(asm_op) \ - : "r2", "r3", "memory"); \ - \ - smp_mb(); \ - \ - temp c_op i; \ - \ - return temp; \ -} - -#define ATOMIC_FETCH_OP(op, c_op, asm_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - unsigned int temp = i; \ - \ - /* Explicit full memory barrier needed before/after */ \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - " mov r2, %0\n" \ - " mov r3, %1\n" \ - " .word %2\n" \ - " mov %0, r2" \ - : "+r"(temp) \ - : "r"(&v->counter), "i"(asm_op) \ - : "r2", "r3", "memory"); \ - \ - smp_mb(); \ - \ - return temp; \ -} - -#define ATOMIC_OPS(op, c_op, asm_op) \ - ATOMIC_OP(op, c_op, asm_op) \ - ATOMIC_OP_RETURN(op, c_op, asm_op) \ - ATOMIC_FETCH_OP(op, c_op, asm_op) - -ATOMIC_OPS(add, +=, CTOP_INST_AADD_DI_R2_R2_R3) -#define atomic_sub(i, v) atomic_add(-(i), (v)) -#define atomic_sub_return(i, v) atomic_add_return(-(i), (v)) -#define atomic_fetch_sub(i, v) atomic_fetch_add(-(i), (v)) - -#undef ATOMIC_OPS -#define ATOMIC_OPS(op, c_op, asm_op) \ - ATOMIC_OP(op, c_op, asm_op) \ - ATOMIC_FETCH_OP(op, c_op, asm_op) - -ATOMIC_OPS(and, &=, CTOP_INST_AAND_DI_R2_R2_R3) -ATOMIC_OPS(or, |=, CTOP_INST_AOR_DI_R2_R2_R3) -ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3) - -#endif /* CONFIG_ARC_PLAT_EZNPS */ - #undef ATOMIC_OPS #undef ATOMIC_FETCH_OP #undef ATOMIC_OP_RETURN diff --git a/arch/arc/include/asm/barrier.h b/arch/arc/include/asm/barrier.h index 7823811e7cf5..4637de9e02fa 100644 --- a/arch/arc/include/asm/barrier.h +++ b/arch/arc/include/asm/barrier.h @@ -27,7 +27,7 @@ #define rmb() asm volatile("dmb 1\n" : : : "memory") #define wmb() asm volatile("dmb 2\n" : : : "memory") -#elif !defined(CONFIG_ARC_PLAT_EZNPS) /* CONFIG_ISA_ARCOMPACT */ +#else /* * ARCompact based cores (ARC700) only have SYNC instruction which is super @@ -37,13 +37,6 @@ #define mb() asm volatile("sync\n" : : : "memory") -#else /* CONFIG_ARC_PLAT_EZNPS */ - -#include <plat/ctop.h> - -#define mb() asm volatile (".word %0" : : "i"(CTOP_INST_SCHD_RW) : "memory") -#define rmb() asm volatile (".word %0" : : "i"(CTOP_INST_SCHD_RD) : "memory") - #endif #include <asm-generic/barrier.h> diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index 50eb3f64a77c..c6606f4d20d6 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -85,7 +85,7 @@ static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long * return (old & (1 << nr)) != 0; \ } -#elif !defined(CONFIG_ARC_PLAT_EZNPS) +#else /* !CONFIG_ARC_HAS_LLSC */ /* * Non hardware assisted Atomic-R-M-W @@ -136,55 +136,7 @@ static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long * return (old & (1UL << (nr & 0x1f))) != 0; \ } -#else /* CONFIG_ARC_PLAT_EZNPS */ - -#define BIT_OP(op, c_op, asm_op) \ -static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\ -{ \ - m += nr >> 5; \ - \ - nr = (1UL << (nr & 0x1f)); \ - if (asm_op == CTOP_INST_AAND_DI_R2_R2_R3) \ - nr = ~nr; \ - \ - __asm__ __volatile__( \ - " mov r2, %0\n" \ - " mov r3, %1\n" \ - " .word %2\n" \ - : \ - : "r"(nr), "r"(m), "i"(asm_op) \ - : "r2", "r3", "memory"); \ -} - -#define TEST_N_BIT_OP(op, c_op, asm_op) \ -static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\ -{ \ - unsigned long old; \ - \ - m += nr >> 5; \ - \ - nr = old = (1UL << (nr & 0x1f)); \ - if (asm_op == CTOP_INST_AAND_DI_R2_R2_R3) \ - old = ~old; \ - \ - /* Explicit full memory barrier needed before/after */ \ - smp_mb(); \ - \ - __asm__ __volatile__( \ - " mov r2, %0\n" \ - " mov r3, %1\n" \ - " .word %2\n" \ - " mov %0, r2" \ - : "+r"(old) \ - : "r"(m), "i"(asm_op) \ - : "r2", "r3", "memory"); \ - \ - smp_mb(); \ - \ - return (old & nr) != 0; \ -} - -#endif /* CONFIG_ARC_PLAT_EZNPS */ +#endif /*************************************** * Non atomic variants @@ -226,15 +178,9 @@ static inline int __test_and_##op##_bit(unsigned long nr, volatile unsigned long /* __test_and_set_bit(), __test_and_clear_bit(), __test_and_change_bit() */\ __TEST_N_BIT_OP(op, c_op, asm_op) -#ifndef CONFIG_ARC_PLAT_EZNPS BIT_OPS(set, |, bset) BIT_OPS(clear, & ~, bclr) BIT_OPS(change, ^, bxor) -#else -BIT_OPS(set, |, CTOP_INST_AOR_DI_R2_R2_R3) -BIT_OPS(clear, & ~, CTOP_INST_AAND_DI_R2_R2_R3) -BIT_OPS(change, ^, CTOP_INST_AXOR_DI_R2_R2_R3) -#endif /* * This routine doesn't need to be atomic. diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index c11398160240..9b87e162e539 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -20,7 +20,7 @@ __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) /* * Explicit full memory barrier needed before/after as - * LLOCK/SCOND thmeselves don't provide any such semantics + * LLOCK/SCOND themselves don't provide any such semantics */ smp_mb(); @@ -41,7 +41,7 @@ __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) return prev; } -#elif !defined(CONFIG_ARC_PLAT_EZNPS) +#else /* !CONFIG_ARC_HAS_LLSC */ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) @@ -61,33 +61,7 @@ __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) return prev; } -#else /* CONFIG_ARC_PLAT_EZNPS */ - -static inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) -{ - /* - * Explicit full memory barrier needed before/after - */ - smp_mb(); - - write_aux_reg(CTOP_AUX_GPA1, expected); - - __asm__ __volatile__( - " mov r2, %0\n" - " mov r3, %1\n" - " .word %2\n" - " mov %0, r2" - : "+r"(new) - : "r"(ptr), "i"(CTOP_INST_EXC_DI_R2_R2_R3) - : "r2", "r3", "memory"); - - smp_mb(); - - return new; -} - -#endif /* CONFIG_ARC_HAS_LLSC */ +#endif #define cmpxchg(ptr, o, n) ({ \ (typeof(*(ptr)))__cmpxchg((ptr), \ @@ -104,8 +78,6 @@ __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new) #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n))) -#ifndef CONFIG_ARC_PLAT_EZNPS - /* * xchg (reg with memory) based on "Native atomic" EX insn */ @@ -168,44 +140,6 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, #endif -#else /* CONFIG_ARC_PLAT_EZNPS */ - -static inline unsigned long __xchg(unsigned long val, volatile void *ptr, - int size) -{ - extern unsigned long __xchg_bad_pointer(void); - - switch (size) { - case 4: - /* - * Explicit full memory barrier needed before/after - */ - smp_mb(); - - __asm__ __volatile__( - " mov r2, %0\n" - " mov r3, %1\n" - " .word %2\n" - " mov %0, r2\n" - : "+r"(val) - : "r"(ptr), "i"(CTOP_INST_XEX_DI_R2_R2_R3) - : "r2", "r3", "memory"); - - smp_mb(); - - return val; - } - return __xchg_bad_pointer(); -} - -#define xchg(ptr, with) ({ \ - (typeof(*(ptr)))__xchg((unsigned long)(with), \ - (ptr), \ - sizeof(*(ptr))); \ -}) - -#endif /* CONFIG_ARC_PLAT_EZNPS */ - /* * "atomic" variant of xchg() * REQ: It needs to follow the same serialization rules as other atomic_xxx() diff --git a/arch/arc/include/asm/entry-compact.h b/arch/arc/include/asm/entry-compact.h index c3aa775878dc..6dbf5cecc8cc 100644 --- a/arch/arc/include/asm/entry-compact.h +++ b/arch/arc/include/asm/entry-compact.h @@ -33,10 +33,6 @@ #include <asm/irqflags-compact.h> #include <asm/thread_info.h> /* For THREAD_SIZE */ -#ifdef CONFIG_ARC_PLAT_EZNPS -#include <plat/ctop.h> -#endif - /*-------------------------------------------------------------- * Switch to Kernel Mode stack if SP points to User Mode stack * @@ -189,12 +185,6 @@ PUSHAX lp_start PUSHAX erbta -#ifdef CONFIG_ARC_PLAT_EZNPS - .word CTOP_INST_SCHD_RW - PUSHAX CTOP_AUX_GPA1 - PUSHAX CTOP_AUX_EFLAGS -#endif - lr r10, [ecr] st r10, [sp, PT_event] /* EV_Trap expects r10 to have ECR */ .endm @@ -211,11 +201,6 @@ * by hardware and that is not good. *-------------------------------------------------------------*/ .macro EXCEPTION_EPILOGUE -#ifdef CONFIG_ARC_PLAT_EZNPS - .word CTOP_INST_SCHD_RW - POPAX CTOP_AUX_EFLAGS - POPAX CTOP_AUX_GPA1 -#endif POPAX erbta POPAX lp_start @@ -278,11 +263,6 @@ PUSHAX lp_start PUSHAX bta_l\LVL\() -#ifdef CONFIG_ARC_PLAT_EZNPS - .word CTOP_INST_SCHD_RW - PUSHAX CTOP_AUX_GPA1 - PUSHAX CTOP_AUX_EFLAGS -#endif .endm /*-------------------------------------------------------------- @@ -295,11 +275,6 @@ * by hardware and that is not good. *-------------------------------------------------------------*/ .macro INTERRUPT_EPILOGUE LVL -#ifdef CONFIG_ARC_PLAT_EZNPS - .word CTOP_INST_SCHD_RW - POPAX CTOP_AUX_EFLAGS - POPAX CTOP_AUX_GPA1 -#endif POPAX bta_l\LVL\() POPAX lp_start @@ -327,13 +302,11 @@ bic \reg, sp, (THREAD_SIZE - 1) .endm -#ifndef CONFIG_ARC_PLAT_EZNPS /* Get CPU-ID of this core */ .macro GET_CPU_ID reg lr \reg, [identity] lsr \reg, \reg, 8 bmsk \reg, \reg, 7 .endm -#endif #endif /* __ASM_ARC_ENTRY_COMPACT_H */ diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 0fcea5bad343..e4031ecd3c8c 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -17,13 +17,6 @@ #include <asm/dsp.h> #include <asm/fpu.h> -#ifdef CONFIG_ARC_PLAT_EZNPS -struct eznps_dp { - unsigned int eflags; - unsigned int gpa1; -}; -#endif - /* Arch specific stuff which needs to be saved per task. * However these items are not so important so as to earn a place in * struct thread_info @@ -38,9 +31,6 @@ struct thread_struct { #ifdef CONFIG_ARC_FPU_SAVE_RESTORE struct arc_fpu fpu; #endif -#ifdef CONFIG_ARC_PLAT_EZNPS - struct eznps_dp dp; -#endif }; #define INIT_THREAD { \ @@ -60,17 +50,8 @@ struct task_struct; * A lot of busy-wait loops in SMP are based off of non-volatile data otherwise * get optimised away by gcc */ -#ifndef CONFIG_EZNPS_MTM_EXT - #define cpu_relax() barrier() -#else - -#define cpu_relax() \ - __asm__ __volatile__ (".word %0" : : "i"(CTOP_INST_SCHD_RW) : "memory") - -#endif - #define KSTK_EIP(tsk) (task_pt_regs(tsk)->ret) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp) @@ -118,25 +99,7 @@ extern unsigned int get_wchan(struct task_struct *p); #define USER_KERNEL_GUTTER (VMALLOC_START - TASK_SIZE) -#ifdef CONFIG_ARC_PLAT_EZNPS -/* NPS architecture defines special window of 129M in user address space for - * special memory areas, when accessing this window the MMU do not use TLB. - * Instead MMU direct the access to: - * 0x57f00000:0x57ffffff -- 1M of closely coupled memory (aka CMEM) - * 0x58000000:0x5fffffff -- 16 huge pages, 8M each, with fixed map (aka FMTs) - * - * CMEM - is the fastest memory we got and its size is 16K. - * FMT - is used to map either to internal/external memory. - * Internal memory is the second fast memory and its size is 16M - * External memory is the biggest memory (16G) and also the slowest. - * - * STACK_TOP need to be PMD align (21bit) that is why we supply 0x57e00000. - */ -#define STACK_TOP 0x57e00000 -#else #define STACK_TOP TASK_SIZE -#endif - #define STACK_TOP_MAX STACK_TOP /* This decides where the kernel will search for a free chunk of vm diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h index 2fdb87addadc..4c3c9be5bd16 100644 --- a/arch/arc/include/asm/ptrace.h +++ b/arch/arc/include/asm/ptrace.h @@ -16,11 +16,6 @@ #ifdef CONFIG_ISA_ARCOMPACT struct pt_regs { -#ifdef CONFIG_ARC_PLAT_EZNPS - unsigned long eflags; /* Extended FLAGS */ - unsigned long gpa1; /* General Purpose Aux */ -#endif - /* Real registers */ unsigned long bta; /* bta_l1, bta_l2, erbta */ diff --git a/arch/arc/include/asm/setup.h b/arch/arc/include/asm/setup.h index 61a97fe70b86..01f85478170d 100644 --- a/arch/arc/include/asm/setup.h +++ b/arch/arc/include/asm/setup.h @@ -9,11 +9,7 @@ #include <linux/types.h> #include <uapi/asm/setup.h> -#ifdef CONFIG_ARC_PLAT_EZNPS -#define COMMAND_LINE_SIZE 2048 -#else #define COMMAND_LINE_SIZE 256 -#endif /* * Data structure to map a ID to string diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 94bbed88e3fc..192871608925 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -232,15 +232,9 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) __asm__ __volatile__( "1: ex %0, [%1] \n" -#ifdef CONFIG_EZNPS_MTM_EXT - " .word %3 \n" -#endif " breq %0, %2, 1b \n" : "+&r" (val) : "r"(&(lock->slock)), "ir"(__ARCH_SPIN_LOCK_LOCKED__) -#ifdef CONFIG_EZNPS_MTM_EXT - , "i"(CTOP_INST_SCHD_RW) -#endif : "memory"); smp_mb(); diff --git a/arch/arc/include/asm/switch_to.h b/arch/arc/include/asm/switch_to.h index 4a3d67989d19..1f85de8288b1 100644 --- a/arch/arc/include/asm/switch_to.h +++ b/arch/arc/include/asm/switch_to.h @@ -12,19 +12,10 @@ #include <asm/dsp-impl.h> #include <asm/fpu.h> -#ifdef CONFIG_ARC_PLAT_EZNPS -extern void dp_save_restore(struct task_struct *p, struct task_struct *n); -#define ARC_EZNPS_DP_PREV(p, n) dp_save_restore(p, n) -#else -#define ARC_EZNPS_DP_PREV(p, n) - -#endif /* !CONFIG_ARC_PLAT_EZNPS */ - struct task_struct *__switch_to(struct task_struct *p, struct task_struct *n); #define switch_to(prev, next, last) \ do { \ - ARC_EZNPS_DP_PREV(prev, next); \ dsp_save_restore(prev, next); \ fpu_save_restore(prev, next); \ last = __switch_to(prev, next);\ diff --git a/arch/arc/kernel/ctx_sw.c b/arch/arc/kernel/ctx_sw.c index e172c3333a84..1a76f2d6f694 100644 --- a/arch/arc/kernel/ctx_sw.c +++ b/arch/arc/kernel/ctx_sw.c @@ -14,9 +14,6 @@ #include <asm/asm-offsets.h> #include <linux/sched.h> #include <linux/sched/debug.h> -#ifdef CONFIG_ARC_PLAT_EZNPS -#include <plat/ctop.h> -#endif #define KSP_WORD_OFF ((TASK_THREAD + THREAD_KSP) / 4) @@ -68,16 +65,9 @@ __switch_to(struct task_struct *prev_task, struct task_struct *next_task) #ifndef CONFIG_SMP "st %2, [@_current_task] \n\t" #else -#ifdef CONFIG_ARC_PLAT_EZNPS - "lr r24, [%4] \n\t" -#ifndef CONFIG_EZNPS_MTM_EXT - "lsr r24, r24, 4 \n\t" -#endif -#else "lr r24, [identity] \n\t" "lsr r24, r24, 8 \n\t" "bmsk r24, r24, 7 \n\t" -#endif "add2 r24, @_current_task, r24 \n\t" "st %2, [r24] \n\t" #endif @@ -115,9 +105,6 @@ __switch_to(struct task_struct *prev_task, struct task_struct *next_task) : "=r"(tmp) : "n"(KSP_WORD_OFF), "r"(next), "r"(prev) -#ifdef CONFIG_ARC_PLAT_EZNPS - , "i"(CTOP_AUX_LOGIC_GLOBAL_ID) -#endif : "blink" ); diff --git a/arch/arc/kernel/devtree.c b/arch/arc/kernel/devtree.c index fa86d13df5ed..721d465f1580 100644 --- a/arch/arc/kernel/devtree.c +++ b/arch/arc/kernel/devtree.c @@ -29,8 +29,6 @@ static void __init arc_set_early_base_baud(unsigned long dt_root) else if (of_flat_dt_is_compatible(dt_root, "snps,arc-sdp") || of_flat_dt_is_compatible(dt_root, "snps,hsdk")) arc_base_baud = 33333333; /* Fixed 33MHz clk (AXS10x & HSDK) */ - else if (of_flat_dt_is_compatible(dt_root, "ezchip,arc-nps")) - arc_base_baud = 800000000; /* Fixed 800MHz clk (NPS) */ else arc_base_baud = 50000000; /* Fixed default 50MHz */ } diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 79849f37e782..145722f80c9b 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -562,7 +562,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) { struct arc_reg_pct_build pct_bcr; struct arc_reg_cc_build cc_bcr; - int i, has_interrupts, irq; + int i, has_interrupts, irq = -1; int counter_size; /* in bits */ union cc_name { @@ -637,19 +637,28 @@ static int arc_pmu_device_probe(struct platform_device *pdev) .attr_groups = arc_pmu->attr_groups, }; - if (has_interrupts && (irq = platform_get_irq(pdev, 0) >= 0)) { + if (has_interrupts) { + irq = platform_get_irq(pdev, 0); + if (irq >= 0) { + int ret; - arc_pmu->irq = irq; + arc_pmu->irq = irq; - /* intc map function ensures irq_set_percpu_devid() called */ - request_percpu_irq(irq, arc_pmu_intr, "ARC perf counters", - this_cpu_ptr(&arc_pmu_cpu)); + /* intc map function ensures irq_set_percpu_devid() called */ + ret = request_percpu_irq(irq, arc_pmu_intr, "ARC perf counters", + this_cpu_ptr(&arc_pmu_cpu)); + + if (!ret) + on_each_cpu(arc_cpu_pmu_irq_init, &irq, 1); + else + irq = -1; + } - on_each_cpu(arc_cpu_pmu_irq_init, &irq, 1); - } else { - arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } + if (irq == -1) + arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + /* * perf parser doesn't really like '-' symbol in events name, so let's * use '_' in arc pct name as it goes to kernel PMU event prefix. diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index efeba1fe7252..37f724ad5e39 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -116,17 +116,6 @@ void arch_cpu_idle(void) :"I"(arg)); /* can't be "r" has to be embedded const */ } -#elif defined(CONFIG_EZNPS_MTM_EXT) /* ARC700 variant in NPS */ - -void arch_cpu_idle(void) -{ - /* only the calling HW thread needs to sleep */ - __asm__ __volatile__( - ".word %0 \n" - : - :"i"(CTOP_INST_HWSCHD_WFT_IE12)); -} - #else /* ARC700 */ void arch_cpu_idle(void) @@ -278,10 +267,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long usp) */ regs->status32 = STATUS_U_MASK | STATUS_L_MASK | ISA_INIT_STATUS_BITS; -#ifdef CONFIG_EZNPS_MTM_EXT - regs->eflags = 0; -#endif - fpu_init_task(regs); /* bogus seed values for debugging */ diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index 8222f8c54690..2be55fb96d87 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -394,6 +394,6 @@ void do_notify_resume(struct pt_regs *regs) * ASM glue gaurantees that this is only called when returning to * user mode */ - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index eca35e02ce06..52906d314537 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -226,7 +226,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) } if (!cpu_online(cpu)) { - pr_info("Timeout: CPU%u FAILED to comeup !!!\n", cpu); + pr_info("Timeout: CPU%u FAILED to come up !!!\n", cpu); return -1; } diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index e947572a521e..517988e60cfc 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -3,7 +3,7 @@ * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) */ -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <asm/cache.h> #include <asm/cacheflush.h> diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S index 31f54bdd95f2..062fae46c3f8 100644 --- a/arch/arc/mm/tlbex.S +++ b/arch/arc/mm/tlbex.S @@ -281,13 +281,6 @@ ex_saved_reg1: .macro COMMIT_ENTRY_TO_MMU #if (CONFIG_ARC_MMU_VER < 4) -#ifdef CONFIG_EZNPS_MTM_EXT - /* verify if entry for this vaddr+ASID already exists */ - sr TLBProbe, [ARC_REG_TLBCOMMAND] - lr r0, [ARC_REG_TLBINDEX] - bbit0 r0, 31, 88f -#endif - /* Get free TLB slot: Set = computed from vaddr, way = random */ sr TLBGetIndex, [ARC_REG_TLBCOMMAND] diff --git a/arch/arc/plat-eznps/Kconfig b/arch/arc/plat-eznps/Kconfig deleted file mode 100644 index a645bca5899a..000000000000 --- a/arch/arc/plat-eznps/Kconfig +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.rst. -# - -menuconfig ARC_PLAT_EZNPS - bool "\"EZchip\" ARC dev platform" - depends on ISA_ARCOMPACT - select CPU_BIG_ENDIAN - select CLKSRC_NPS if !PHYS_ADDR_T_64BIT - select EZNPS_GIC - select EZCHIP_NPS_MANAGEMENT_ENET if ETHERNET - help - Support for EZchip development platforms, - based on ARC700 cores. - We handle few flavors: - - Hardware Emulator AKA HE which is FPGA based chassis - - Simulator based on MetaWare nSIM - - NPS400 chip based on ASIC - -config EZNPS_MTM_EXT - bool "ARC-EZchip MTM Extensions" - select CPUMASK_OFFSTACK - depends on ARC_PLAT_EZNPS && SMP - default y - help - Here we add new hierarchy for CPUs topology. - We got: - Core - Thread - At the new thread level each CPU represent one HW thread. - At highest hierarchy each core contain 16 threads, - any of them seem like CPU from Linux point of view. - All threads within same core share the execution unit of the - core and HW scheduler round robin between them. - -config EZNPS_MEM_ERROR_ALIGN - bool "ARC-EZchip Memory error as an exception" - depends on EZNPS_MTM_EXT - default n - help - On the real chip of the NPS, user memory errors are handled - as a machine check exception, which is fatal, whereas on - simulator platform for NPS, is handled as a Level 2 interrupt - (just a stock ARC700) which is recoverable. This option makes - simulator behave like hardware. - -config EZNPS_SHARED_AUX_REGS - bool "ARC-EZchip Shared Auxiliary Registers Per Core" - depends on ARC_PLAT_EZNPS - default y - help - On the real chip of the NPS, auxiliary registers are shared between - all the cpus of the core, whereas on simulator platform for NPS, - each cpu has a different set of auxiliary registers. Configuration - should be unset if auxiliary registers are not shared between the cpus - of the core, so there will be a need to initialize them per cpu. diff --git a/arch/arc/plat-eznps/Makefile b/arch/arc/plat-eznps/Makefile deleted file mode 100644 index ebb9723002cf..000000000000 --- a/arch/arc/plat-eznps/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile for the linux kernel. -# - -obj-y := entry.o platform.o ctop.o -obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_EZNPS_MTM_EXT) += mtm.o diff --git a/arch/arc/plat-eznps/ctop.c b/arch/arc/plat-eznps/ctop.c deleted file mode 100644 index b398e6e838a9..000000000000 --- a/arch/arc/plat-eznps/ctop.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -#include <linux/sched.h> -#include <asm/processor.h> -#include <plat/ctop.h> - -void dp_save_restore(struct task_struct *prev, struct task_struct *next) -{ - struct eznps_dp *prev_task_dp = &prev->thread.dp; - struct eznps_dp *next_task_dp = &next->thread.dp; - - /* Here we save all Data Plane related auxiliary registers */ - prev_task_dp->eflags = read_aux_reg(CTOP_AUX_EFLAGS); - write_aux_reg(CTOP_AUX_EFLAGS, next_task_dp->eflags); - - prev_task_dp->gpa1 = read_aux_reg(CTOP_AUX_GPA1); - write_aux_reg(CTOP_AUX_GPA1, next_task_dp->gpa1); -} diff --git a/arch/arc/plat-eznps/entry.S b/arch/arc/plat-eznps/entry.S deleted file mode 100644 index 3f18c0108e72..000000000000 --- a/arch/arc/plat-eznps/entry.S +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/******************************************************************************* - - EZNPS CPU startup Code - Copyright(c) 2012 EZchip Technologies. - - -*******************************************************************************/ -#include <linux/linkage.h> -#include <asm/entry.h> -#include <asm/cache.h> -#include <plat/ctop.h> - - .cpu A7 - - .section .init.text, "ax",@progbits - .align 1024 ; HW requierment for restart first PC - -ENTRY(res_service) -#if defined(CONFIG_EZNPS_MTM_EXT) && defined(CONFIG_EZNPS_SHARED_AUX_REGS) - ; There is no work for HW thread id != 0 - lr r3, [CTOP_AUX_THREAD_ID] - cmp r3, 0 - jne stext -#endif - -#ifdef CONFIG_ARC_HAS_DCACHE - ; With no cache coherency mechanism D$ need to be used very carefully. - ; Address space: - ; 0G-2G: We disable CONFIG_ARC_CACHE_PAGES. - ; 2G-3G: We disable D$ by setting this bit. - ; 3G-4G: D$ is disabled by architecture. - ; FMT are huge pages for user application reside at 0-2G. - ; Only FMT left as one who can use D$ where each such page got - ; disable/enable bit for cachability. - ; Programmer will use FMT pages for private data so cache coherency - ; would not be a problem. - ; First thing we invalidate D$ - sr 1, [ARC_REG_DC_IVDC] - sr HW_COMPLY_KRN_NOT_D_CACHED, [CTOP_AUX_HW_COMPLY] -#endif - -#ifdef CONFIG_SMP - ; We set logical cpuid to be used by GET_CPUID - ; We do not use physical cpuid since we want ids to be continious when - ; it comes to cpus on the same quad cluster. - ; This is useful for applications that used shared resources of a quad - ; cluster such SRAMS. - lr r3, [CTOP_AUX_CORE_ID] - sr r3, [CTOP_AUX_LOGIC_CORE_ID] - lr r3, [CTOP_AUX_CLUSTER_ID] - ; Set logical is acheived by swap of 2 middle bits of cluster id (4 bit) - ; r3 is used since we use short instruction and we need q-class reg - .short CTOP_INST_MOV2B_FLIP_R3_B1_B2_INST - .word CTOP_INST_MOV2B_FLIP_R3_B1_B2_LIMM - sr r3, [CTOP_AUX_LOGIC_CLUSTER_ID] -#endif - - j stext -END(res_service) diff --git a/arch/arc/plat-eznps/include/plat/ctop.h b/arch/arc/plat-eznps/include/plat/ctop.h deleted file mode 100644 index 77712c5ffe84..000000000000 --- a/arch/arc/plat-eznps/include/plat/ctop.h +++ /dev/null @@ -1,208 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -#ifndef _PLAT_EZNPS_CTOP_H -#define _PLAT_EZNPS_CTOP_H - -#ifndef CONFIG_ARC_PLAT_EZNPS -#error "Incorrect ctop.h include" -#endif - -#include <linux/bits.h> -#include <linux/types.h> -#include <soc/nps/common.h> - -/* core auxiliary registers */ -#ifdef __ASSEMBLY__ -#define CTOP_AUX_BASE (-0x800) -#else -#define CTOP_AUX_BASE 0xFFFFF800 -#endif - -#define CTOP_AUX_GLOBAL_ID (CTOP_AUX_BASE + 0x000) -#define CTOP_AUX_CLUSTER_ID (CTOP_AUX_BASE + 0x004) -#define CTOP_AUX_CORE_ID (CTOP_AUX_BASE + 0x008) -#define CTOP_AUX_THREAD_ID (CTOP_AUX_BASE + 0x00C) -#define CTOP_AUX_LOGIC_GLOBAL_ID (CTOP_AUX_BASE + 0x010) -#define CTOP_AUX_LOGIC_CLUSTER_ID (CTOP_AUX_BASE + 0x014) -#define CTOP_AUX_LOGIC_CORE_ID (CTOP_AUX_BASE + 0x018) -#define CTOP_AUX_MT_CTRL (CTOP_AUX_BASE + 0x020) -#define CTOP_AUX_HW_COMPLY (CTOP_AUX_BASE + 0x024) -#define CTOP_AUX_DPC (CTOP_AUX_BASE + 0x02C) -#define CTOP_AUX_LPC (CTOP_AUX_BASE + 0x030) -#define CTOP_AUX_EFLAGS (CTOP_AUX_BASE + 0x080) -#define CTOP_AUX_GPA1 (CTOP_AUX_BASE + 0x08C) -#define CTOP_AUX_UDMC (CTOP_AUX_BASE + 0x300) - -/* EZchip core instructions */ -#define CTOP_INST_HWSCHD_WFT_IE12 0x3E6F7344 -#define CTOP_INST_HWSCHD_OFF_R4 0x3C6F00BF -#define CTOP_INST_HWSCHD_RESTORE_R4 0x3E6F7103 -#define CTOP_INST_SCHD_RW 0x3E6F7004 -#define CTOP_INST_SCHD_RD 0x3E6F7084 -#define CTOP_INST_ASRI_0_R3 0x3B56003E -#define CTOP_INST_XEX_DI_R2_R2_R3 0x4A664C00 -#define CTOP_INST_EXC_DI_R2_R2_R3 0x4A664C01 -#define CTOP_INST_AADD_DI_R2_R2_R3 0x4A664C02 -#define CTOP_INST_AAND_DI_R2_R2_R3 0x4A664C04 -#define CTOP_INST_AOR_DI_R2_R2_R3 0x4A664C05 -#define CTOP_INST_AXOR_DI_R2_R2_R3 0x4A664C06 - -/* Do not use D$ for address in 2G-3G */ -#define HW_COMPLY_KRN_NOT_D_CACHED BIT(28) - -#define NPS_MSU_EN_CFG 0x80 -#define NPS_CRG_BLKID 0x480 -#define NPS_CRG_SYNC_BIT BIT(0) -#define NPS_GIM_BLKID 0x5C0 - -/* GIM registers and fields*/ -#define NPS_GIM_UART_LINE BIT(7) -#define NPS_GIM_DBG_LAN_EAST_TX_DONE_LINE BIT(10) -#define NPS_GIM_DBG_LAN_EAST_RX_RDY_LINE BIT(11) -#define NPS_GIM_DBG_LAN_WEST_TX_DONE_LINE BIT(25) -#define NPS_GIM_DBG_LAN_WEST_RX_RDY_LINE BIT(26) - -#ifndef __ASSEMBLY__ -/* Functional registers definition */ -struct nps_host_reg_mtm_cfg { - union { - struct { - u32 gen:1, gdis:1, clk_gate_dis:1, asb:1, - __reserved:9, nat:3, ten:16; - }; - u32 value; - }; -}; - -struct nps_host_reg_mtm_cpu_cfg { - union { - struct { - u32 csa:22, dmsid:6, __reserved:3, cs:1; - }; - u32 value; - }; -}; - -struct nps_host_reg_thr_init { - union { - struct { - u32 str:1, __reserved:27, thr_id:4; - }; - u32 value; - }; -}; - -struct nps_host_reg_thr_init_sts { - union { - struct { - u32 bsy:1, err:1, __reserved:26, thr_id:4; - }; - u32 value; - }; -}; - -struct nps_host_reg_msu_en_cfg { - union { - struct { - u32 __reserved1:11, - rtc_en:1, ipc_en:1, gim_1_en:1, - gim_0_en:1, ipi_en:1, buff_e_rls_bmuw:1, - buff_e_alc_bmuw:1, buff_i_rls_bmuw:1, buff_i_alc_bmuw:1, - buff_e_rls_bmue:1, buff_e_alc_bmue:1, buff_i_rls_bmue:1, - buff_i_alc_bmue:1, __reserved2:1, buff_e_pre_en:1, - buff_i_pre_en:1, pmuw_ja_en:1, pmue_ja_en:1, - pmuw_nj_en:1, pmue_nj_en:1, msu_en:1; - }; - u32 value; - }; -}; - -struct nps_host_reg_gim_p_int_dst { - union { - struct { - u32 int_out_en:1, __reserved1:4, - is:1, intm:2, __reserved2:4, - nid:4, __reserved3:4, cid:4, - __reserved4:4, tid:4; - }; - u32 value; - }; -}; - -/* AUX registers definition */ -struct nps_host_reg_aux_dpc { - union { - struct { - u32 ien:1, men:1, hen:1, reserved:29; - }; - u32 value; - }; -}; - -struct nps_host_reg_aux_udmc { - union { - struct { - u32 dcp:1, cme:1, __reserved:19, nat:3, - __reserved2:5, dcas:3; - }; - u32 value; - }; -}; - -struct nps_host_reg_aux_mt_ctrl { - union { - struct { - u32 mten:1, hsen:1, scd:1, sten:1, - st_cnt:8, __reserved:8, - hs_cnt:8, __reserved1:4; - }; - u32 value; - }; -}; - -struct nps_host_reg_aux_hw_comply { - union { - struct { - u32 me:1, le:1, te:1, knc:1, __reserved:28; - }; - u32 value; - }; -}; - -struct nps_host_reg_aux_lpc { - union { - struct { - u32 mep:1, __reserved:31; - }; - u32 value; - }; -}; - -/* CRG registers */ -#define REG_GEN_PURP_0 nps_host_reg_non_cl(NPS_CRG_BLKID, 0x1BF) - -/* GIM registers */ -#define REG_GIM_P_INT_EN_0 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x100) -#define REG_GIM_P_INT_POL_0 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x110) -#define REG_GIM_P_INT_SENS_0 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x114) -#define REG_GIM_P_INT_BLK_0 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x118) -#define REG_GIM_P_INT_DST_10 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x13A) -#define REG_GIM_P_INT_DST_11 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x13B) -#define REG_GIM_P_INT_DST_25 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x149) -#define REG_GIM_P_INT_DST_26 nps_host_reg_non_cl(NPS_GIM_BLKID, 0x14A) - -#else - -.macro GET_CPU_ID reg - lr \reg, [CTOP_AUX_LOGIC_GLOBAL_ID] -#ifndef CONFIG_EZNPS_MTM_EXT - lsr \reg, \reg, 4 -#endif -.endm - -#endif /* __ASSEMBLY__ */ - -#endif /* _PLAT_EZNPS_CTOP_H */ diff --git a/arch/arc/plat-eznps/include/plat/mtm.h b/arch/arc/plat-eznps/include/plat/mtm.h deleted file mode 100644 index 7c55becc891b..000000000000 --- a/arch/arc/plat-eznps/include/plat/mtm.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -#ifndef _PLAT_EZNPS_MTM_H -#define _PLAT_EZNPS_MTM_H - -#include <plat/ctop.h> - -static inline void *nps_mtm_reg_addr(u32 cpu, u32 reg) -{ - struct global_id gid; - u32 core, blkid; - - gid.value = cpu; - core = gid.core; - blkid = (((core & 0x0C) << 2) | (core & 0x03)); - - return nps_host_reg(cpu, blkid, reg); -} - -#ifdef CONFIG_EZNPS_MTM_EXT -#define NPS_CPU_TO_THREAD_NUM(cpu) \ - ({ struct global_id gid; gid.value = cpu; gid.thread; }) - -/* MTM registers */ -#define MTM_CFG(cpu) nps_mtm_reg_addr(cpu, 0x81) -#define MTM_THR_INIT(cpu) nps_mtm_reg_addr(cpu, 0x92) -#define MTM_THR_INIT_STS(cpu) nps_mtm_reg_addr(cpu, 0x93) - -#define get_thread(map) map.thread -#define eznps_max_cpus 4096 -#define eznps_cpus_per_cluster 256 - -void mtm_enable_core(unsigned int cpu); -int mtm_enable_thread(int cpu); -#else /* !CONFIG_EZNPS_MTM_EXT */ - -#define get_thread(map) 0 -#define eznps_max_cpus 256 -#define eznps_cpus_per_cluster 16 -#define mtm_enable_core(cpu) -#define mtm_enable_thread(cpu) 1 -#define NPS_CPU_TO_THREAD_NUM(cpu) 0 - -#endif /* CONFIG_EZNPS_MTM_EXT */ - -#endif /* _PLAT_EZNPS_MTM_H */ diff --git a/arch/arc/plat-eznps/include/plat/smp.h b/arch/arc/plat-eznps/include/plat/smp.h deleted file mode 100644 index e433f118bdca..000000000000 --- a/arch/arc/plat-eznps/include/plat/smp.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -#ifndef __PLAT_EZNPS_SMP_H -#define __PLAT_EZNPS_SMP_H - -#ifdef CONFIG_SMP - -extern void res_service(void); - -#endif /* CONFIG_SMP */ - -#endif diff --git a/arch/arc/plat-eznps/mtm.c b/arch/arc/plat-eznps/mtm.c deleted file mode 100644 index 3dcf5a9e2976..000000000000 --- a/arch/arc/plat-eznps/mtm.c +++ /dev/null @@ -1,166 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/io.h> -#include <linux/log2.h> -#include <asm/arcregs.h> -#include <plat/mtm.h> -#include <plat/smp.h> - -#define MT_HS_CNT_MIN 0x01 -#define MT_HS_CNT_MAX 0xFF -#define MT_CTRL_ST_CNT 0xF -#define NPS_NUM_HW_THREADS 0x10 - -static int mtm_hs_ctr = MT_HS_CNT_MAX; - -#ifdef CONFIG_EZNPS_MEM_ERROR_ALIGN -int do_memory_error(unsigned long address, struct pt_regs *regs) -{ - die("Invalid Mem Access", regs, address); - - return 1; -} -#endif - -static void mtm_init_nat(int cpu) -{ - struct nps_host_reg_mtm_cfg mtm_cfg; - struct nps_host_reg_aux_udmc udmc; - int log_nat, nat = 0, i, t; - - /* Iterate core threads and update nat */ - for (i = 0, t = cpu; i < NPS_NUM_HW_THREADS; i++, t++) - nat += test_bit(t, cpumask_bits(cpu_possible_mask)); - - log_nat = ilog2(nat); - - udmc.value = read_aux_reg(CTOP_AUX_UDMC); - udmc.nat = log_nat; - write_aux_reg(CTOP_AUX_UDMC, udmc.value); - - mtm_cfg.value = ioread32be(MTM_CFG(cpu)); - mtm_cfg.nat = log_nat; - iowrite32be(mtm_cfg.value, MTM_CFG(cpu)); -} - -static void mtm_init_thread(int cpu) -{ - int i, tries = 5; - struct nps_host_reg_thr_init thr_init; - struct nps_host_reg_thr_init_sts thr_init_sts; - - /* Set thread init register */ - thr_init.value = 0; - iowrite32be(thr_init.value, MTM_THR_INIT(cpu)); - thr_init.thr_id = NPS_CPU_TO_THREAD_NUM(cpu); - thr_init.str = 1; - iowrite32be(thr_init.value, MTM_THR_INIT(cpu)); - - /* Poll till thread init is done */ - for (i = 0; i < tries; i++) { - thr_init_sts.value = ioread32be(MTM_THR_INIT_STS(cpu)); - if (thr_init_sts.thr_id == thr_init.thr_id) { - if (thr_init_sts.bsy) - continue; - else if (thr_init_sts.err) - pr_warn("Failed to thread init cpu %u\n", cpu); - break; - } - - pr_warn("Wrong thread id in thread init for cpu %u\n", cpu); - break; - } - - if (i == tries) - pr_warn("Got thread init timeout for cpu %u\n", cpu); -} - -int mtm_enable_thread(int cpu) -{ - struct nps_host_reg_mtm_cfg mtm_cfg; - - if (NPS_CPU_TO_THREAD_NUM(cpu) == 0) - return 1; - - /* Enable thread in mtm */ - mtm_cfg.value = ioread32be(MTM_CFG(cpu)); - mtm_cfg.ten |= (1 << (NPS_CPU_TO_THREAD_NUM(cpu))); - iowrite32be(mtm_cfg.value, MTM_CFG(cpu)); - - return 0; -} - -void mtm_enable_core(unsigned int cpu) -{ - int i; - struct nps_host_reg_aux_mt_ctrl mt_ctrl; - struct nps_host_reg_mtm_cfg mtm_cfg; - struct nps_host_reg_aux_dpc dpc; - - /* - * Initializing dpc register in each CPU. - * Overwriting the init value of the DPC - * register so that CMEM and FMT virtual address - * spaces are accessible, and Data Plane HW - * facilities are enabled. - */ - dpc.ien = 1; - dpc.men = 1; - write_aux_reg(CTOP_AUX_DPC, dpc.value); - - if (NPS_CPU_TO_THREAD_NUM(cpu) != 0) - return; - - /* Initialize Number of Active Threads */ - mtm_init_nat(cpu); - - /* Initialize mtm_cfg */ - mtm_cfg.value = ioread32be(MTM_CFG(cpu)); - mtm_cfg.ten = 1; - iowrite32be(mtm_cfg.value, MTM_CFG(cpu)); - - /* Initialize all other threads in core */ - for (i = 1; i < NPS_NUM_HW_THREADS; i++) - mtm_init_thread(cpu + i); - - - /* Enable HW schedule, stall counter, mtm */ - mt_ctrl.value = 0; - mt_ctrl.hsen = 1; - mt_ctrl.hs_cnt = mtm_hs_ctr; - mt_ctrl.mten = 1; - write_aux_reg(CTOP_AUX_MT_CTRL, mt_ctrl.value); - - /* - * HW scheduling mechanism will start working - * Only after call to instruction "schd.rw". - * cpu_relax() calls "schd.rw" instruction. - */ - cpu_relax(); -} - -/* Verify and set the value of the mtm hs counter */ -static int __init set_mtm_hs_ctr(char *ctr_str) -{ - int hs_ctr; - int ret; - - ret = kstrtoint(ctr_str, 0, &hs_ctr); - - if (ret || hs_ctr > MT_HS_CNT_MAX || hs_ctr < MT_HS_CNT_MIN) { - pr_err("** Invalid @nps_mtm_hs_ctr [%d] needs to be [%d:%d] (incl)\n", - hs_ctr, MT_HS_CNT_MIN, MT_HS_CNT_MAX); - return -EINVAL; - } - - mtm_hs_ctr = hs_ctr; - - return 0; -} -early_param("nps_mtm_hs_ctr", set_mtm_hs_ctr); diff --git a/arch/arc/plat-eznps/platform.c b/arch/arc/plat-eznps/platform.c deleted file mode 100644 index 6de2fe840043..000000000000 --- a/arch/arc/plat-eznps/platform.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -#include <linux/init.h> -#include <linux/io.h> -#include <asm/mach_desc.h> -#include <plat/mtm.h> - -static void __init eznps_configure_msu(void) -{ - int cpu; - struct nps_host_reg_msu_en_cfg msu_en_cfg = {.value = 0}; - - msu_en_cfg.msu_en = 1; - msu_en_cfg.ipi_en = 1; - msu_en_cfg.gim_0_en = 1; - msu_en_cfg.gim_1_en = 1; - - /* enable IPI and GIM messages on all clusters */ - for (cpu = 0 ; cpu < eznps_max_cpus; cpu += eznps_cpus_per_cluster) - iowrite32be(msu_en_cfg.value, - nps_host_reg(cpu, NPS_MSU_BLKID, NPS_MSU_EN_CFG)); -} - -static void __init eznps_configure_gim(void) -{ - u32 reg_value; - u32 gim_int_lines; - struct nps_host_reg_gim_p_int_dst gim_p_int_dst = {.value = 0}; - - gim_int_lines = NPS_GIM_UART_LINE; - gim_int_lines |= NPS_GIM_DBG_LAN_EAST_TX_DONE_LINE; - gim_int_lines |= NPS_GIM_DBG_LAN_EAST_RX_RDY_LINE; - gim_int_lines |= NPS_GIM_DBG_LAN_WEST_TX_DONE_LINE; - gim_int_lines |= NPS_GIM_DBG_LAN_WEST_RX_RDY_LINE; - - /* - * IRQ polarity - * low or high level - * negative or positive edge - */ - reg_value = ioread32be(REG_GIM_P_INT_POL_0); - reg_value &= ~gim_int_lines; - iowrite32be(reg_value, REG_GIM_P_INT_POL_0); - - /* IRQ type level or edge */ - reg_value = ioread32be(REG_GIM_P_INT_SENS_0); - reg_value |= NPS_GIM_DBG_LAN_EAST_TX_DONE_LINE; - reg_value |= NPS_GIM_DBG_LAN_WEST_TX_DONE_LINE; - iowrite32be(reg_value, REG_GIM_P_INT_SENS_0); - - /* - * GIM interrupt select type for - * dbg_lan TX and RX interrupts - * should be type 1 - * type 0 = IRQ line 6 - * type 1 = IRQ line 7 - */ - gim_p_int_dst.is = 1; - iowrite32be(gim_p_int_dst.value, REG_GIM_P_INT_DST_10); - iowrite32be(gim_p_int_dst.value, REG_GIM_P_INT_DST_11); - iowrite32be(gim_p_int_dst.value, REG_GIM_P_INT_DST_25); - iowrite32be(gim_p_int_dst.value, REG_GIM_P_INT_DST_26); - - /* - * CTOP IRQ lines should be defined - * as blocking in GIM - */ - iowrite32be(gim_int_lines, REG_GIM_P_INT_BLK_0); - - /* enable CTOP IRQ lines in GIM */ - iowrite32be(gim_int_lines, REG_GIM_P_INT_EN_0); -} - -static void __init eznps_early_init(void) -{ - eznps_configure_msu(); - eznps_configure_gim(); -} - -static const char *eznps_compat[] __initconst = { - "ezchip,arc-nps", - NULL, -}; - -MACHINE_START(NPS, "nps") - .dt_compat = eznps_compat, - .init_early = eznps_early_init, -MACHINE_END diff --git a/arch/arc/plat-eznps/smp.c b/arch/arc/plat-eznps/smp.c deleted file mode 100644 index f119cb7de2ae..000000000000 --- a/arch/arc/plat-eznps/smp.c +++ /dev/null @@ -1,138 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright(c) 2015 EZchip Technologies. - */ - -#include <linux/smp.h> -#include <linux/of_fdt.h> -#include <linux/io.h> -#include <linux/irqdomain.h> -#include <asm/irq.h> -#include <plat/ctop.h> -#include <plat/smp.h> -#include <plat/mtm.h> - -#define NPS_DEFAULT_MSID 0x34 -#define NPS_MTM_CPU_CFG 0x90 - -static char smp_cpuinfo_buf[128] = {"Extn [EZNPS-SMP]\t: On\n"}; - -/* Get cpu map from device tree */ -static int __init eznps_get_map(const char *name, struct cpumask *cpumask) -{ - unsigned long dt_root = of_get_flat_dt_root(); - const char *buf; - - buf = of_get_flat_dt_prop(dt_root, name, NULL); - if (!buf) - return 1; - - cpulist_parse(buf, cpumask); - - return 0; -} - -/* Update board cpu maps */ -static void __init eznps_init_cpumasks(void) -{ - struct cpumask cpumask; - - if (eznps_get_map("present-cpus", &cpumask)) { - pr_err("Failed to get present-cpus from dtb"); - return; - } - init_cpu_present(&cpumask); - - if (eznps_get_map("possible-cpus", &cpumask)) { - pr_err("Failed to get possible-cpus from dtb"); - return; - } - init_cpu_possible(&cpumask); -} - -static void eznps_init_core(unsigned int cpu) -{ - u32 sync_value; - struct nps_host_reg_aux_hw_comply hw_comply; - struct nps_host_reg_aux_lpc lpc; - - if (NPS_CPU_TO_THREAD_NUM(cpu) != 0) - return; - - hw_comply.value = read_aux_reg(CTOP_AUX_HW_COMPLY); - hw_comply.me = 1; - hw_comply.le = 1; - hw_comply.te = 1; - write_aux_reg(CTOP_AUX_HW_COMPLY, hw_comply.value); - - /* Enable MMU clock */ - lpc.mep = 1; - write_aux_reg(CTOP_AUX_LPC, lpc.value); - - /* Boot CPU only */ - if (!cpu) { - /* Write to general purpose register in CRG */ - sync_value = ioread32be(REG_GEN_PURP_0); - sync_value |= NPS_CRG_SYNC_BIT; - iowrite32be(sync_value, REG_GEN_PURP_0); - } -} - -/* - * Master kick starting another CPU - */ -static void __init eznps_smp_wakeup_cpu(int cpu, unsigned long pc) -{ - struct nps_host_reg_mtm_cpu_cfg cpu_cfg; - - if (mtm_enable_thread(cpu) == 0) - return; - - /* set PC, dmsid, and start CPU */ - cpu_cfg.value = (u32)res_service; - cpu_cfg.dmsid = NPS_DEFAULT_MSID; - cpu_cfg.cs = 1; - iowrite32be(cpu_cfg.value, nps_mtm_reg_addr(cpu, NPS_MTM_CPU_CFG)); -} - -static void eznps_ipi_send(int cpu) -{ - struct global_id gid; - struct { - union { - struct { - u32 num:8, cluster:8, core:8, thread:8; - }; - u32 value; - }; - } ipi; - - gid.value = cpu; - ipi.thread = get_thread(gid); - ipi.core = gid.core; - ipi.cluster = nps_cluster_logic_to_phys(gid.cluster); - ipi.num = NPS_IPI_IRQ; - - __asm__ __volatile__( - " mov r3, %0\n" - " .word %1\n" - : - : "r"(ipi.value), "i"(CTOP_INST_ASRI_0_R3) - : "r3"); -} - -static void eznps_init_per_cpu(int cpu) -{ - smp_ipi_irq_setup(cpu, NPS_IPI_IRQ); - - eznps_init_core(cpu); - mtm_enable_core(cpu); -} - -struct plat_smp_ops plat_smp_ops = { - .info = smp_cpuinfo_buf, - .init_early_smp = eznps_init_cpumasks, - .cpu_kick = eznps_smp_wakeup_cpu, - .ipi_send = eznps_ipi_send, - .init_per_cpu = eznps_init_per_cpu, -}; diff --git a/arch/arc/plat-hsdk/Kconfig b/arch/arc/plat-hsdk/Kconfig index ce8101834518..6b5c54576f54 100644 --- a/arch/arc/plat-hsdk/Kconfig +++ b/arch/arc/plat-hsdk/Kconfig @@ -8,5 +8,6 @@ menuconfig ARC_SOC_HSDK select ARC_HAS_ACCL_REGS select ARC_IRQ_NO_AUTOSAVE select CLK_HSDK + select RESET_CONTROLLER select RESET_HSDK select HAVE_PCI diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 3996b6572c3a..c18fa9d382b7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -120,6 +120,7 @@ config ARM select PCI_SYSCALL if PCI select PERF_USE_VMALLOC select RTC_LIB + select SET_FS select SYS_SUPPORTS_APM_EMULATION # Above selects are sorted alphabetically; please add new ones # according to that. Thanks. diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 80000a66a4e3..87912e5c2256 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -1546,6 +1546,17 @@ config DEBUG_SIRFSOC_UART bool depends on ARCH_SIRF +config DEBUG_UART_FLOW_CONTROL + bool "Enable flow control (CTS) for the debug UART" + depends on DEBUG_LL + default y if ARCH_EBSA110 || DEBUG_FOOTBRIDGE_COM1 || DEBUG_GEMINI || ARCH_RPC + help + Some UART ports are connected to terminals that will use modem + control signals to indicate whether they are ready to receive text. + In practice this means that the terminal is asserting the special + control signal CTS (Clear To Send). If your debug UART supports + this and your debug terminal will require it, enable this option. + config DEBUG_LL_INCLUDE string default "debug/sa1100.S" if DEBUG_SA1100 @@ -1893,11 +1904,6 @@ config DEBUG_UART_8250_PALMCHIP except for having a different register layout. Say Y here if the debug UART is of this type. -config DEBUG_UART_8250_FLOW_CONTROL - bool "Enable flow control for 8250 UART" - depends on DEBUG_LL_UART_8250 || DEBUG_UART_8250 - default y if ARCH_EBSA110 || DEBUG_FOOTBRIDGE_COM1 || DEBUG_GEMINI || ARCH_RPC - config DEBUG_UNCOMPRESS bool "Enable decompressor debugging via DEBUG_LL output" depends on ARCH_MULTIPLATFORM || PLAT_SAMSUNG || ARM_SINGLE_ARMV7M diff --git a/arch/arm/Makefile b/arch/arm/Makefile index e589da3c8949..2874cd9923b7 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -20,10 +20,6 @@ endif # linker. All sections should be explicitly named in the linker script. LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) -ifeq ($(CONFIG_ARM_MODULE_PLTS),y) -KBUILD_LDS_MODULE += $(srctree)/arch/arm/kernel/module.lds -endif - GZFLAGS :=-9 #KBUILD_CFLAGS +=-pipe @@ -143,6 +139,9 @@ head-y := arch/arm/kernel/head$(MMUEXT).o # Text offset. This list is sorted numerically by address in order to # provide a means to avoid/resolve conflicts in multi-arch kernels. +# Note: the 32kB below this value is reserved for use by the kernel +# during boot, and this offset is critical to the functioning of +# kexec-tools. textofs-y := 0x00008000 # We don't want the htc bootloader to corrupt kernel during resume textofs-$(CONFIG_PM_H1940) := 0x00108000 diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 58028abd05d9..47f001ca5499 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -7,11 +7,11 @@ OBJS = -AFLAGS_head.o += -DTEXT_OFFSET=$(TEXT_OFFSET) HEAD = head.o OBJS += misc.o decompress.o ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y) OBJS += debug.o +AFLAGS_head.o += -DDEBUG endif FONTC = $(srctree)/lib/fonts/font_acorn_8x8.c @@ -68,7 +68,12 @@ ZTEXTADDR := 0 ZBSSADDR := ALIGN(8) endif +MALLOC_SIZE := 65536 + +AFLAGS_head.o += -DTEXT_OFFSET=$(TEXT_OFFSET) -DMALLOC_SIZE=$(MALLOC_SIZE) CPPFLAGS_vmlinux.lds := -DTEXT_START="$(ZTEXTADDR)" -DBSS_START="$(ZBSSADDR)" +CPPFLAGS_vmlinux.lds += -DTEXT_OFFSET="$(TEXT_OFFSET)" +CPPFLAGS_vmlinux.lds += -DMALLOC_SIZE="$(MALLOC_SIZE)" compress-$(CONFIG_KERNEL_GZIP) = gzip compress-$(CONFIG_KERNEL_LZO) = lzo diff --git a/arch/arm/boot/compressed/debug.S b/arch/arm/boot/compressed/debug.S index 6bf2917a4621..fac40a717fcf 100644 --- a/arch/arm/boot/compressed/debug.S +++ b/arch/arm/boot/compressed/debug.S @@ -8,7 +8,10 @@ ENTRY(putc) addruart r1, r2, r3 - waituart r3, r1 +#ifdef CONFIG_DEBUG_UART_FLOW_CONTROL + waituartcts r3, r1 +#endif + waituarttxrdy r3, r1 senduart r0, r1 busyuart r3, r1 mov pc, lr diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 434a16982e34..2e04ec5b5446 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -28,19 +28,19 @@ #if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_V6K) || defined(CONFIG_CPU_V7) .macro loadsp, rb, tmp1, tmp2 .endm - .macro writeb, ch, rb + .macro writeb, ch, rb, tmp mcr p14, 0, \ch, c0, c5, 0 .endm #elif defined(CONFIG_CPU_XSCALE) .macro loadsp, rb, tmp1, tmp2 .endm - .macro writeb, ch, rb + .macro writeb, ch, rb, tmp mcr p14, 0, \ch, c8, c0, 0 .endm #else .macro loadsp, rb, tmp1, tmp2 .endm - .macro writeb, ch, rb + .macro writeb, ch, rb, tmp mcr p14, 0, \ch, c1, c0, 0 .endm #endif @@ -49,8 +49,13 @@ #include CONFIG_DEBUG_LL_INCLUDE - .macro writeb, ch, rb + .macro writeb, ch, rb, tmp +#ifdef CONFIG_DEBUG_UART_FLOW_CONTROL + waituartcts \tmp, \rb +#endif + waituarttxrdy \tmp, \rb senduart \ch, \rb + busyuart \tmp, \rb .endm #if defined(CONFIG_ARCH_SA1100) @@ -81,42 +86,11 @@ bl phex .endm - .macro debug_reloc_start -#ifdef DEBUG - kputc #'\n' - kphex r6, 8 /* processor id */ - kputc #':' - kphex r7, 8 /* architecture id */ -#ifdef CONFIG_CPU_CP15 - kputc #':' - mrc p15, 0, r0, c1, c0 - kphex r0, 8 /* control reg */ -#endif - kputc #'\n' - kphex r5, 8 /* decompressed kernel start */ - kputc #'-' - kphex r9, 8 /* decompressed kernel end */ - kputc #'>' - kphex r4, 8 /* kernel execution address */ - kputc #'\n' -#endif - .endm - - .macro debug_reloc_end -#ifdef DEBUG - kphex r5, 8 /* end of kernel */ - kputc #'\n' - mov r0, r4 - bl memdump /* dump 256 bytes at start of kernel */ -#endif - .endm - /* * Debug kernel copy by printing the memory addresses involved */ .macro dbgkc, begin, end, cbegin, cend #ifdef DEBUG - kputc #'\n' kputc #'C' kputc #':' kputc #'0' @@ -136,7 +110,28 @@ kputc #'x' kphex \cend, 8 /* End of kernel copy */ kputc #'\n' - kputc #'\r' +#endif + .endm + + /* + * Debug print of the final appended DTB location + */ + .macro dbgadtb, begin, end +#ifdef DEBUG + kputc #'D' + kputc #'T' + kputc #'B' + kputc #':' + kputc #'0' + kputc #'x' + kphex \begin, 8 /* Start of appended DTB */ + kputc #' ' + kputc #'(' + kputc #'0' + kputc #'x' + kphex \end, 8 /* End of appended DTB */ + kputc #')' + kputc #'\n' #endif .endm @@ -303,7 +298,7 @@ restart: adr r0, LC1 #ifndef CONFIG_ZBOOT_ROM /* malloc space is above the relocated stack (64k max) */ - add r10, sp, #0x10000 + add r10, sp, #MALLOC_SIZE #else /* * With ZBOOT_ROM the bss/stack is non relocatable, @@ -357,6 +352,7 @@ restart: adr r0, LC1 mov r5, r5, ror #8 eor r5, r5, r1, lsr #8 #endif + dbgadtb r6, r5 /* 50% DTB growth should be good enough */ add r5, r5, r5, lsr #1 /* preserve 64-bit alignment */ @@ -614,7 +610,7 @@ not_relocated: mov r0, #0 */ mov r0, r4 mov r1, sp @ malloc space above stack - add r2, sp, #0x10000 @ 64k max + add r2, sp, #MALLOC_SIZE @ 64k max mov r3, r7 bl decompress_kernel @@ -1356,7 +1352,7 @@ puts: loadsp r3, r2, r1 1: ldrb r2, [r0], #1 teq r2, #0 moveq pc, lr -2: writeb r2, r3 +2: writeb r2, r3, r1 mov r1, #0x00020000 3: subs r1, r1, #1 bne 3b diff --git a/arch/arm/boot/compressed/vmlinux.lds.S b/arch/arm/boot/compressed/vmlinux.lds.S index b914be3a207b..1bcb68ac4b01 100644 --- a/arch/arm/boot/compressed/vmlinux.lds.S +++ b/arch/arm/boot/compressed/vmlinux.lds.S @@ -44,10 +44,12 @@ SECTIONS } .table : ALIGN(4) { _table_start = .; - LONG(ZIMAGE_MAGIC(4)) + LONG(ZIMAGE_MAGIC(6)) LONG(ZIMAGE_MAGIC(0x5a534c4b)) LONG(ZIMAGE_MAGIC(__piggy_size_addr - _start)) LONG(ZIMAGE_MAGIC(_kernel_bss_size)) + LONG(ZIMAGE_MAGIC(TEXT_OFFSET)) + LONG(ZIMAGE_MAGIC(MALLOC_SIZE)) LONG(0) _table_end = .; } diff --git a/arch/arm/boot/dts/at91-sam9x60ek.dts b/arch/arm/boot/dts/at91-sam9x60ek.dts index ca15ff8fea18..eae28b82c7fd 100644 --- a/arch/arm/boot/dts/at91-sam9x60ek.dts +++ b/arch/arm/boot/dts/at91-sam9x60ek.dts @@ -563,6 +563,12 @@ atmel,pins = <AT91_PIOD 18 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>; }; }; + + usb0 { + pinctrl_usba_vbus: usba_vbus { + atmel,pins = <AT91_PIOB 16 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>; + }; + }; }; /* pinctrl */ &pmc { @@ -666,6 +672,13 @@ }; }; +&usb0 { + atmel,vbus-gpio = <&pioB 16 GPIO_ACTIVE_HIGH>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_usba_vbus>; + status = "okay"; +}; + &usb1 { num-ports = <3>; atmel,vbus-gpio = <0 diff --git a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts index 222d7825e1ab..e94244a215af 100644 --- a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts +++ b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts @@ -4,6 +4,8 @@ #include "bcm2835-rpi.dtsi" #include "bcm283x-rpi-usb-peripheral.dtsi" +#include <dt-bindings/reset/raspberrypi,firmware-reset.h> + / { compatible = "raspberrypi,4-model-b", "brcm,bcm2711"; model = "Raspberry Pi 4 Model B"; @@ -88,6 +90,11 @@ ""; status = "okay"; }; + + reset: reset { + compatible = "raspberrypi,firmware-reset"; + #reset-cells = <1>; + }; }; &gpio { @@ -207,6 +214,21 @@ }; }; +&pcie0 { + pci@1,0 { + #address-cells = <3>; + #size-cells = <2>; + ranges; + + reg = <0 0 0 0 0>; + + usb@1,0 { + reg = <0x10000 0 0 0 0>; + resets = <&reset RASPBERRYPI_FIRMWARE_RESET_ID_USB>; + }; + }; +}; + /* uart0 communicates with the BT module */ &uart0 { pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/cros-ec-keyboard.dtsi b/arch/arm/boot/dts/cros-ec-keyboard.dtsi index 4a0c1037fbc0..165c5bcd510e 100644 --- a/arch/arm/boot/dts/cros-ec-keyboard.dtsi +++ b/arch/arm/boot/dts/cros-ec-keyboard.dtsi @@ -46,6 +46,7 @@ MATRIX_KEY(0x02, 0x09, KEY_F8) MATRIX_KEY(0x02, 0x0a, KEY_YEN) + MATRIX_KEY(0x03, 0x00, KEY_LEFTMETA) MATRIX_KEY(0x03, 0x01, KEY_GRAVE) MATRIX_KEY(0x03, 0x02, KEY_F2) MATRIX_KEY(0x03, 0x03, KEY_5) diff --git a/arch/arm/boot/dts/r8a7742.dtsi b/arch/arm/boot/dts/r8a7742.dtsi index 9743b4242801..0240d017c90d 100644 --- a/arch/arm/boot/dts/r8a7742.dtsi +++ b/arch/arm/boot/dts/r8a7742.dtsi @@ -386,6 +386,54 @@ #thermal-sensor-cells = <0>; }; + ipmmu_sy0: iommu@e6280000 { + compatible = "renesas,ipmmu-r8a7742", + "renesas,ipmmu-vmsa"; + reg = <0 0xe6280000 0 0x1000>; + interrupts = <GIC_SPI 223 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 224 IRQ_TYPE_LEVEL_HIGH>; + #iommu-cells = <1>; + status = "disabled"; + }; + + ipmmu_sy1: iommu@e6290000 { + compatible = "renesas,ipmmu-r8a7742", + "renesas,ipmmu-vmsa"; + reg = <0 0xe6290000 0 0x1000>; + interrupts = <GIC_SPI 225 IRQ_TYPE_LEVEL_HIGH>; + #iommu-cells = <1>; + status = "disabled"; + }; + + ipmmu_ds: iommu@e6740000 { + compatible = "renesas,ipmmu-r8a7742", + "renesas,ipmmu-vmsa"; + reg = <0 0xe6740000 0 0x1000>; + interrupts = <GIC_SPI 198 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 199 IRQ_TYPE_LEVEL_HIGH>; + #iommu-cells = <1>; + status = "disabled"; + }; + + ipmmu_mp: iommu@ec680000 { + compatible = "renesas,ipmmu-r8a7742", + "renesas,ipmmu-vmsa"; + reg = <0 0xec680000 0 0x1000>; + interrupts = <GIC_SPI 226 IRQ_TYPE_LEVEL_HIGH>; + #iommu-cells = <1>; + status = "disabled"; + }; + + ipmmu_mx: iommu@fe951000 { + compatible = "renesas,ipmmu-r8a7742", + "renesas,ipmmu-vmsa"; + reg = <0 0xfe951000 0 0x1000>; + interrupts = <GIC_SPI 222 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 221 IRQ_TYPE_LEVEL_HIGH>; + #iommu-cells = <1>; + status = "disabled"; + }; + icram0: sram@e63a0000 { compatible = "mmio-sram"; reg = <0 0xe63a0000 0 0x12000>; diff --git a/arch/arm/boot/dts/sam9x60.dtsi b/arch/arm/boot/dts/sam9x60.dtsi index d10843da4a85..42f76212d472 100644 --- a/arch/arm/boot/dts/sam9x60.dtsi +++ b/arch/arm/boot/dts/sam9x60.dtsi @@ -69,6 +69,20 @@ #size-cells = <1>; ranges; + usb0: gadget@500000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "microchip,sam9x60-udc"; + reg = <0x00500000 0x100000 + 0xf803c000 0x400>; + interrupts = <23 IRQ_TYPE_LEVEL_HIGH 2>; + clocks = <&pmc PMC_TYPE_PERIPHERAL 23>, <&pmc PMC_TYPE_CORE PMC_UTMI>; + clock-names = "pclk", "hclk"; + assigned-clocks = <&pmc PMC_TYPE_CORE PMC_UTMI>; + assigned-clock-rates = <480000000>; + status = "disabled"; + }; + usb1: ohci@600000 { compatible = "atmel,at91rm9200-ohci", "usb-ohci"; reg = <0x00600000 0x100000>; diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c index f4b719bde763..7996c04393d5 100644 --- a/arch/arm/common/dmabounce.c +++ b/arch/arm/common/dmabounce.c @@ -24,7 +24,8 @@ #include <linux/slab.h> #include <linux/page-flags.h> #include <linux/device.h> -#include <linux/dma-mapping.h> +#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <linux/dmapool.h> #include <linux/list.h> #include <linux/scatterlist.h> diff --git a/arch/arm/include/asm/dma-contiguous.h b/arch/arm/include/asm/dma-contiguous.h deleted file mode 100644 index d785187a6f8a..000000000000 --- a/arch/arm/include/asm/dma-contiguous.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASMARM_DMA_CONTIGUOUS_H -#define ASMARM_DMA_CONTIGUOUS_H - -#ifdef __KERNEL__ -#ifdef CONFIG_DMA_CMA - -#include <linux/types.h> - -void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size); - -#endif -#endif - -#endif diff --git a/arch/arm/include/asm/dma-direct.h b/arch/arm/include/asm/dma-direct.h index 7c3001a6a775..77fcb7ee5ec9 100644 --- a/arch/arm/include/asm/dma-direct.h +++ b/arch/arm/include/asm/dma-direct.h @@ -2,13 +2,44 @@ #ifndef ASM_ARM_DMA_DIRECT_H #define ASM_ARM_DMA_DIRECT_H 1 -static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +#include <asm/memory.h> + +/* + * dma_to_pfn/pfn_to_dma/virt_to_dma are architecture private + * functions used internally by the DMA-mapping API to provide DMA + * addresses. They must not be used by drivers. + */ +static inline dma_addr_t pfn_to_dma(struct device *dev, unsigned long pfn) +{ + if (dev && dev->dma_range_map) + pfn = PFN_DOWN(translate_phys_to_dma(dev, PFN_PHYS(pfn))); + return (dma_addr_t)__pfn_to_bus(pfn); +} + +static inline unsigned long dma_to_pfn(struct device *dev, dma_addr_t addr) +{ + unsigned long pfn = __bus_to_pfn(addr); + + if (dev && dev->dma_range_map) + pfn = PFN_DOWN(translate_dma_to_phys(dev, PFN_PHYS(pfn))); + return pfn; +} + +static inline dma_addr_t virt_to_dma(struct device *dev, void *addr) +{ + if (dev) + return pfn_to_dma(dev, virt_to_pfn(addr)); + + return (dma_addr_t)__virt_to_bus((unsigned long)(addr)); +} + +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { unsigned int offset = paddr & ~PAGE_MASK; return pfn_to_dma(dev, __phys_to_pfn(paddr)) + offset; } -static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) { unsigned int offset = dev_addr & ~PAGE_MASK; return __pfn_to_phys(dma_to_pfn(dev, dev_addr)) + offset; diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h index 86405cc81385..fe9ef6f79e9c 100644 --- a/arch/arm/include/asm/dma-iommu.h +++ b/arch/arm/include/asm/dma-iommu.h @@ -6,7 +6,6 @@ #include <linux/mm_types.h> #include <linux/scatterlist.h> -#include <linux/dma-debug.h> #include <linux/kref.h> struct dma_iommu_mapping { diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index bdd80ddbca34..77082246a5e1 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -6,9 +6,6 @@ #include <linux/mm_types.h> #include <linux/scatterlist.h> -#include <linux/dma-debug.h> - -#include <asm/memory.h> #include <xen/xen.h> #include <asm/xen/hypervisor.h> @@ -23,74 +20,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return NULL; } -#ifdef __arch_page_to_dma -#error Please update to __arch_pfn_to_dma -#endif - -/* - * dma_to_pfn/pfn_to_dma/dma_to_virt/virt_to_dma are architecture private - * functions used internally by the DMA-mapping API to provide DMA - * addresses. They must not be used by drivers. - */ -#ifndef __arch_pfn_to_dma -static inline dma_addr_t pfn_to_dma(struct device *dev, unsigned long pfn) -{ - if (dev) - pfn -= dev->dma_pfn_offset; - return (dma_addr_t)__pfn_to_bus(pfn); -} - -static inline unsigned long dma_to_pfn(struct device *dev, dma_addr_t addr) -{ - unsigned long pfn = __bus_to_pfn(addr); - - if (dev) - pfn += dev->dma_pfn_offset; - - return pfn; -} - -static inline void *dma_to_virt(struct device *dev, dma_addr_t addr) -{ - if (dev) { - unsigned long pfn = dma_to_pfn(dev, addr); - - return phys_to_virt(__pfn_to_phys(pfn)); - } - - return (void *)__bus_to_virt((unsigned long)addr); -} - -static inline dma_addr_t virt_to_dma(struct device *dev, void *addr) -{ - if (dev) - return pfn_to_dma(dev, virt_to_pfn(addr)); - - return (dma_addr_t)__virt_to_bus((unsigned long)(addr)); -} - -#else -static inline dma_addr_t pfn_to_dma(struct device *dev, unsigned long pfn) -{ - return __arch_pfn_to_dma(dev, pfn); -} - -static inline unsigned long dma_to_pfn(struct device *dev, dma_addr_t addr) -{ - return __arch_dma_to_pfn(dev, addr); -} - -static inline void *dma_to_virt(struct device *dev, dma_addr_t addr) -{ - return __arch_dma_to_virt(dev, addr); -} - -static inline dma_addr_t virt_to_dma(struct device *dev, void *addr) -{ - return __arch_virt_to_dma(dev, addr); -} -#endif - /** * arm_dma_alloc - allocate consistent memory for DMA * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices diff --git a/arch/arm/include/asm/mach/pci.h b/arch/arm/include/asm/mach/pci.h index 83d340702680..ea9bd08895b7 100644 --- a/arch/arm/include/asm/mach/pci.h +++ b/arch/arm/include/asm/mach/pci.h @@ -17,10 +17,8 @@ struct pci_host_bridge; struct device; struct hw_pci { - struct msi_controller *msi_ctrl; struct pci_ops *ops; int nr_controllers; - unsigned int io_optional:1; void **private_data; int (*setup)(int nr, struct pci_sys_data *); int (*scan)(int nr, struct pci_host_bridge *); @@ -28,11 +26,6 @@ struct hw_pci { void (*postinit)(void); u8 (*swizzle)(struct pci_dev *dev, u8 *pin); int (*map_irq)(const struct pci_dev *dev, u8 slot, u8 pin); - resource_size_t (*align_resource)(struct pci_dev *dev, - const struct resource *res, - resource_size_t start, - resource_size_t size, - resource_size_t align); }; /* diff --git a/arch/arm/kernel/module.lds b/arch/arm/include/asm/module.lds.h index 79cb6af565e5..0e7cb4e314b4 100644 --- a/arch/arm/kernel/module.lds +++ b/arch/arm/include/asm/module.lds.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifdef CONFIG_ARM_MODULE_PLTS SECTIONS { .plt : { BYTE(0) } .init.plt : { BYTE(0) } } +#endif diff --git a/arch/arm/include/debug/8250.S b/arch/arm/include/debug/8250.S index e4a036f082c2..e3692a37cede 100644 --- a/arch/arm/include/debug/8250.S +++ b/arch/arm/include/debug/8250.S @@ -45,10 +45,11 @@ bne 1002b .endm - .macro waituart,rd,rx -#ifdef CONFIG_DEBUG_UART_8250_FLOW_CONTROL + .macro waituarttxrdy,rd,rx + .endm + + .macro waituartcts,rd,rx 1001: load \rd, [\rx, #UART_MSR << UART_SHIFT] tst \rd, #UART_MSR_CTS beq 1001b -#endif .endm diff --git a/arch/arm/include/debug/asm9260.S b/arch/arm/include/debug/asm9260.S index 0da1eb625331..5a0ce145c44a 100644 --- a/arch/arm/include/debug/asm9260.S +++ b/arch/arm/include/debug/asm9260.S @@ -11,7 +11,10 @@ ldr \rv, = CONFIG_DEBUG_UART_VIRT .endm - .macro waituart,rd,rx + .macro waituarttxrdy,rd,rx + .endm + + .macro waituartcts,rd,rx .endm .macro senduart,rd,rx diff --git a/arch/arm/include/debug/at91.S b/arch/arm/include/debug/at91.S index 6c91cbaaa20b..17722824e2f2 100644 --- a/arch/arm/include/debug/at91.S +++ b/arch/arm/include/debug/at91.S @@ -19,12 +19,15 @@ strb \rd, [\rx, #(AT91_DBGU_THR)] @ Write to Transmitter Holding Register .endm - .macro waituart,rd,rx + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #(AT91_DBGU_SR)] @ Read Status Register tst \rd, #AT91_DBGU_TXRDY @ DBGU_TXRDY = 1 when ready to transmit beq 1001b .endm + .macro waituartcts,rd,rx + .endm + .macro busyuart,rd,rx 1001: ldr \rd, [\rx, #(AT91_DBGU_SR)] @ Read Status Register tst \rd, #AT91_DBGU_TXEMPTY @ DBGU_TXEMPTY = 1 when transmission complete diff --git a/arch/arm/include/debug/bcm63xx.S b/arch/arm/include/debug/bcm63xx.S index 06a896227396..da65abb6738d 100644 --- a/arch/arm/include/debug/bcm63xx.S +++ b/arch/arm/include/debug/bcm63xx.S @@ -17,12 +17,15 @@ strb \rd, [\rx, #UART_FIFO_REG] .endm - .macro waituart, rd, rx + .macro waituarttxrdy, rd, rx 1001: ldr \rd, [\rx, #UART_IR_REG] tst \rd, #(1 << UART_IR_TXEMPTY) beq 1001b .endm + .macro waituartcts, rd, rx + .endm + .macro busyuart, rd, rx 1002: ldr \rd, [\rx, #UART_IR_REG] tst \rd, #(1 << UART_IR_TXTRESH) diff --git a/arch/arm/include/debug/brcmstb.S b/arch/arm/include/debug/brcmstb.S index 132a20c4a676..7ffe66993029 100644 --- a/arch/arm/include/debug/brcmstb.S +++ b/arch/arm/include/debug/brcmstb.S @@ -142,7 +142,10 @@ ARM_BE8( rev \rd, \rd ) bne 1002b .endm - .macro waituart,rd,rx + .macro waituarttxrdy,rd,rx + .endm + + .macro waituartcts,rd,rx .endm /* diff --git a/arch/arm/include/debug/clps711x.S b/arch/arm/include/debug/clps711x.S index 774a67ac3877..a983d12a6515 100644 --- a/arch/arm/include/debug/clps711x.S +++ b/arch/arm/include/debug/clps711x.S @@ -20,7 +20,10 @@ ldr \rp, =CLPS711X_UART_PADDR .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx .endm .macro senduart,rd,rx diff --git a/arch/arm/include/debug/dc21285.S b/arch/arm/include/debug/dc21285.S index d7e8c71706ab..4ec0e5e31704 100644 --- a/arch/arm/include/debug/dc21285.S +++ b/arch/arm/include/debug/dc21285.S @@ -34,5 +34,8 @@ bne 1001b .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx .endm diff --git a/arch/arm/include/debug/digicolor.S b/arch/arm/include/debug/digicolor.S index 256f5f4da275..443674cad76a 100644 --- a/arch/arm/include/debug/digicolor.S +++ b/arch/arm/include/debug/digicolor.S @@ -21,7 +21,10 @@ strb \rd, [\rx, #UA0_EMI_REC] .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx .endm .macro busyuart,rd,rx diff --git a/arch/arm/include/debug/efm32.S b/arch/arm/include/debug/efm32.S index 5ed5028306f4..b0083d6e31e8 100644 --- a/arch/arm/include/debug/efm32.S +++ b/arch/arm/include/debug/efm32.S @@ -29,7 +29,10 @@ strb \rd, [\rx, #UARTn_TXDATA] .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #UARTn_STATUS] tst \rd, #UARTn_STATUS_TXBL beq 1001b diff --git a/arch/arm/include/debug/icedcc.S b/arch/arm/include/debug/icedcc.S index 74a0dd036a17..d5e65da8a687 100644 --- a/arch/arm/include/debug/icedcc.S +++ b/arch/arm/include/debug/icedcc.S @@ -23,7 +23,10 @@ beq 1001b .endm - .macro waituart, rd, rx + .macro waituartcts, rd, rx + .endm + + .macro waituarttxrdy, rd, rx mov \rd, #0x2000000 1001: subs \rd, \rd, #1 @@ -47,7 +50,10 @@ beq 1001b .endm - .macro waituart, rd, rx + .macro waituartcts, rd, rx + .endm + + .macro waituarttxrdy, rd, rx mov \rd, #0x10000000 1001: subs \rd, \rd, #1 @@ -72,7 +78,10 @@ .endm - .macro waituart, rd, rx + .macro waituartcts, rd, rx + .endm + + .macro waituarttxrdy, rd, rx mov \rd, #0x2000000 1001: subs \rd, \rd, #1 diff --git a/arch/arm/include/debug/imx.S b/arch/arm/include/debug/imx.S index 1c1b9d1da4c8..bb7b9550580c 100644 --- a/arch/arm/include/debug/imx.S +++ b/arch/arm/include/debug/imx.S @@ -35,7 +35,10 @@ str \rd, [\rx, #0x40] @ TXDATA .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx .endm .macro busyuart,rd,rx diff --git a/arch/arm/include/debug/meson.S b/arch/arm/include/debug/meson.S index 1e501a0054ae..7b60e4401225 100644 --- a/arch/arm/include/debug/meson.S +++ b/arch/arm/include/debug/meson.S @@ -25,7 +25,10 @@ beq 1002b .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #MESON_AO_UART_STATUS] tst \rd, #MESON_AO_UART_TX_FIFO_FULL bne 1001b diff --git a/arch/arm/include/debug/msm.S b/arch/arm/include/debug/msm.S index 9405b71461da..530edc74f9a3 100644 --- a/arch/arm/include/debug/msm.S +++ b/arch/arm/include/debug/msm.S @@ -17,7 +17,10 @@ ARM_BE8(rev \rd, \rd ) str \rd, [\rx, #0x70] .endm - .macro waituart, rd, rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy, rd, rx @ check for TX_EMT in UARTDM_SR ldr \rd, [\rx, #0x08] ARM_BE8(rev \rd, \rd ) diff --git a/arch/arm/include/debug/omap2plus.S b/arch/arm/include/debug/omap2plus.S index b5696a33ba0f..0680be6c79d3 100644 --- a/arch/arm/include/debug/omap2plus.S +++ b/arch/arm/include/debug/omap2plus.S @@ -75,5 +75,8 @@ omap_uart_lsr: .word 0 bne 1001b .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx .endm diff --git a/arch/arm/include/debug/pl01x.S b/arch/arm/include/debug/pl01x.S index a2a553afe7b8..0c7bfa4c10db 100644 --- a/arch/arm/include/debug/pl01x.S +++ b/arch/arm/include/debug/pl01x.S @@ -26,7 +26,10 @@ strb \rd, [\rx, #UART01x_DR] .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #UART01x_FR] ARM_BE8( rev \rd, \rd ) tst \rd, #UART01x_FR_TXFF diff --git a/arch/arm/include/debug/renesas-scif.S b/arch/arm/include/debug/renesas-scif.S index 25f06663a9a4..8e433e981bbe 100644 --- a/arch/arm/include/debug/renesas-scif.S +++ b/arch/arm/include/debug/renesas-scif.S @@ -33,7 +33,10 @@ ldr \rv, =SCIF_VIRT .endm - .macro waituart, rd, rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy, rd, rx 1001: ldrh \rd, [\rx, #FSR] tst \rd, #TDFE beq 1001b diff --git a/arch/arm/include/debug/sa1100.S b/arch/arm/include/debug/sa1100.S index 6109e6058e5b..7968ea52df3d 100644 --- a/arch/arm/include/debug/sa1100.S +++ b/arch/arm/include/debug/sa1100.S @@ -51,7 +51,10 @@ str \rd, [\rx, #UTDR] .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #UTSR1] tst \rd, #UTSR1_TNF beq 1001b diff --git a/arch/arm/include/debug/samsung.S b/arch/arm/include/debug/samsung.S index 69201d7fb48f..ab474d564a90 100644 --- a/arch/arm/include/debug/samsung.S +++ b/arch/arm/include/debug/samsung.S @@ -69,7 +69,10 @@ ARM_BE8(rev \rd, \rd) 1002: @ exit busyuart .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx ldr \rd, [\rx, # S3C2410_UFCON] ARM_BE8(rev \rd, \rd) tst \rd, #S3C2410_UFCON_FIFOMODE @ fifo enabled? diff --git a/arch/arm/include/debug/sirf.S b/arch/arm/include/debug/sirf.S index e73e4de0a015..3612c7b9cbe7 100644 --- a/arch/arm/include/debug/sirf.S +++ b/arch/arm/include/debug/sirf.S @@ -29,7 +29,10 @@ .macro busyuart,rd,rx .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #SIRF_LLUART_TXFIFO_STATUS] tst \rd, #SIRF_LLUART_TXFIFO_EMPTY beq 1001b diff --git a/arch/arm/include/debug/sti.S b/arch/arm/include/debug/sti.S index 6b42c91f217d..72d052511890 100644 --- a/arch/arm/include/debug/sti.S +++ b/arch/arm/include/debug/sti.S @@ -45,7 +45,10 @@ strb \rd, [\rx, #ASC_TX_BUF_OFF] .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #ASC_STA_OFF] tst \rd, #ASC_STA_TX_FULL bne 1001b diff --git a/arch/arm/include/debug/stm32.S b/arch/arm/include/debug/stm32.S index f3c4a37210ed..b6d9df30e37d 100644 --- a/arch/arm/include/debug/stm32.S +++ b/arch/arm/include/debug/stm32.S @@ -27,7 +27,10 @@ strb \rd, [\rx, #STM32_USART_TDR_OFF] .endm -.macro waituart,rd,rx +.macro waituartcts,rd,rx +.endm + +.macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #(STM32_USART_SR_OFF)] @ Read Status Register tst \rd, #STM32_USART_TXE @ TXE = 1 = tx empty beq 1001b diff --git a/arch/arm/include/debug/tegra.S b/arch/arm/include/debug/tegra.S index 2148d0f88591..98daa7f48314 100644 --- a/arch/arm/include/debug/tegra.S +++ b/arch/arm/include/debug/tegra.S @@ -178,15 +178,16 @@ 1002: .endm - .macro waituart, rd, rx -#ifdef FLOW_CONTROL + .macro waituartcts, rd, rx cmp \rx, #0 beq 1002f 1001: ldrb \rd, [\rx, #UART_MSR << UART_SHIFT] tst \rd, #UART_MSR_CTS beq 1001b 1002: -#endif + .endm + + .macro waituarttxrdy,rd,rx .endm /* diff --git a/arch/arm/include/debug/vf.S b/arch/arm/include/debug/vf.S index 854d9bd82770..035bcbf117ab 100644 --- a/arch/arm/include/debug/vf.S +++ b/arch/arm/include/debug/vf.S @@ -29,5 +29,8 @@ beq 1001b @ wait until transmit done .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx .endm diff --git a/arch/arm/include/debug/vt8500.S b/arch/arm/include/debug/vt8500.S index 8dc1df2d91b8..d01094fdbc8c 100644 --- a/arch/arm/include/debug/vt8500.S +++ b/arch/arm/include/debug/vt8500.S @@ -28,7 +28,10 @@ bne 1001b .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx .endm #endif diff --git a/arch/arm/include/debug/zynq.S b/arch/arm/include/debug/zynq.S index 58d77c972fd6..5d42cc35ecf3 100644 --- a/arch/arm/include/debug/zynq.S +++ b/arch/arm/include/debug/zynq.S @@ -33,7 +33,10 @@ strb \rd, [\rx, #UART_FIFO_OFFSET] @ TXDATA .endm - .macro waituart,rd,rx + .macro waituartcts,rd,rx + .endm + + .macro waituarttxrdy,rd,rx 1001: ldr \rd, [\rx, #UART_SR_OFFSET] ARM_BE8( rev \rd, \rd ) tst \rd, #UART_SR_TXEMPTY diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index eecec16aa708..e7ef2b5bea9c 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -394,8 +394,7 @@ static int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) return irq; } -static int pcibios_init_resource(int busnr, struct pci_sys_data *sys, - int io_optional) +static int pcibios_init_resource(int busnr, struct pci_sys_data *sys) { int ret; struct resource_entry *window; @@ -405,14 +404,6 @@ static int pcibios_init_resource(int busnr, struct pci_sys_data *sys, &iomem_resource, sys->mem_offset); } - /* - * If a platform says I/O port support is optional, we don't add - * the default I/O space. The platform is responsible for adding - * any I/O space it needs. - */ - if (io_optional) - return 0; - resource_list_for_each_entry(window, &sys->resources) if (resource_type(window->res) == IORESOURCE_IO) return 0; @@ -462,7 +453,7 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, if (ret > 0) { - ret = pcibios_init_resource(nr, sys, hw->io_optional); + ret = pcibios_init_resource(nr, sys); if (ret) { pci_free_host_bridge(bridge); break; @@ -480,9 +471,6 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, bridge->sysdata = sys; bridge->busnr = sys->busnr; bridge->ops = hw->ops; - bridge->msi = hw->msi_ctrl; - bridge->align_resource = - hw->align_resource; ret = pci_scan_root_bus_bridge(bridge); } diff --git a/arch/arm/kernel/debug.S b/arch/arm/kernel/debug.S index e112072b579d..d92f44bdf438 100644 --- a/arch/arm/kernel/debug.S +++ b/arch/arm/kernel/debug.S @@ -89,11 +89,18 @@ ENTRY(printascii) 2: teq r1, #'\n' bne 3f mov r1, #'\r' - waituart r2, r3 +#ifdef CONFIG_DEBUG_UART_FLOW_CONTROL + waituartcts r2, r3 +#endif + waituarttxrdy r2, r3 senduart r1, r3 busyuart r2, r3 mov r1, #'\n' -3: waituart r2, r3 +3: +#ifdef CONFIG_DEBUG_UART_FLOW_CONTROL + waituartcts r2, r3 +#endif + waituarttxrdy r2, r3 senduart r1, r3 busyuart r2, r3 b 1b diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 7a4853b1213a..08660ae9dcbc 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -683,6 +683,40 @@ static void disable_single_step(struct perf_event *bp) arch_install_hw_breakpoint(bp); } +/* + * Arm32 hardware does not always report a watchpoint hit address that matches + * one of the watchpoints set. It can also report an address "near" the + * watchpoint if a single instruction access both watched and unwatched + * addresses. There is no straight-forward way, short of disassembling the + * offending instruction, to map that address back to the watchpoint. This + * function computes the distance of the memory access from the watchpoint as a + * heuristic for the likelyhood that a given access triggered the watchpoint. + * + * See this same function in the arm64 platform code, which has the same + * problem. + * + * The function returns the distance of the address from the bytes watched by + * the watchpoint. In case of an exact match, it returns 0. + */ +static u32 get_distance_from_watchpoint(unsigned long addr, u32 val, + struct arch_hw_breakpoint_ctrl *ctrl) +{ + u32 wp_low, wp_high; + u32 lens, lene; + + lens = __ffs(ctrl->len); + lene = __fls(ctrl->len); + + wp_low = val + lens; + wp_high = val + lene; + if (addr < wp_low) + return wp_low - addr; + else if (addr > wp_high) + return addr - wp_high; + else + return 0; +} + static int watchpoint_fault_on_uaccess(struct pt_regs *regs, struct arch_hw_breakpoint *info) { @@ -692,23 +726,25 @@ static int watchpoint_fault_on_uaccess(struct pt_regs *regs, static void watchpoint_handler(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { - int i, access; - u32 val, ctrl_reg, alignment_mask; + int i, access, closest_match = 0; + u32 min_dist = -1, dist; + u32 val, ctrl_reg; struct perf_event *wp, **slots; struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; slots = this_cpu_ptr(wp_on_reg); + /* + * Find all watchpoints that match the reported address. If no exact + * match is found. Attribute the hit to the closest watchpoint. + */ + rcu_read_lock(); for (i = 0; i < core_num_wrps; ++i) { - rcu_read_lock(); - wp = slots[i]; - if (wp == NULL) - goto unlock; + continue; - info = counter_arch_bp(wp); /* * The DFAR is an unknown value on debug architectures prior * to 7.1. Since we only allow a single watchpoint on these @@ -717,33 +753,31 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, */ if (debug_arch < ARM_DEBUG_ARCH_V7_1) { BUG_ON(i > 0); + info = counter_arch_bp(wp); info->trigger = wp->attr.bp_addr; } else { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) - alignment_mask = 0x7; - else - alignment_mask = 0x3; - - /* Check if the watchpoint value matches. */ - val = read_wb_reg(ARM_BASE_WVR + i); - if (val != (addr & ~alignment_mask)) - goto unlock; - - /* Possible match, check the byte address select. */ - ctrl_reg = read_wb_reg(ARM_BASE_WCR + i); - decode_ctrl_reg(ctrl_reg, &ctrl); - if (!((1 << (addr & alignment_mask)) & ctrl.len)) - goto unlock; - /* Check that the access type matches. */ if (debug_exception_updates_fsr()) { access = (fsr & ARM_FSR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) - goto unlock; + continue; } + val = read_wb_reg(ARM_BASE_WVR + i); + ctrl_reg = read_wb_reg(ARM_BASE_WCR + i); + decode_ctrl_reg(ctrl_reg, &ctrl); + dist = get_distance_from_watchpoint(addr, val, &ctrl); + if (dist < min_dist) { + min_dist = dist; + closest_match = i; + } + /* Is this an exact match? */ + if (dist != 0) + continue; + /* We have a winner. */ + info = counter_arch_bp(wp); info->trigger = addr; } @@ -765,13 +799,23 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, * we can single-step over the watchpoint trigger. */ if (!is_default_overflow_handler(wp)) - goto unlock; - + continue; step: enable_single_step(wp, instruction_pointer(regs)); -unlock: - rcu_read_unlock(); } + + if (min_dist > 0 && min_dist != -1) { + /* No exact match found. */ + wp = slots[closest_match]; + info = counter_arch_bp(wp); + info->trigger = addr; + pr_debug("watchpoint fired: address = 0x%x\n", info->trigger); + perf_bp_event(wp, regs); + if (is_default_overflow_handler(wp)) + enable_single_step(wp, instruction_pointer(regs)); + } + + rcu_read_unlock(); } static void watchpoint_single_step_handler(unsigned long pc) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index c1892f733f20..585edbfccf6d 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -669,7 +669,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) } else if (thread_flags & _TIF_UPROBE) { uprobe_notify_resume(regs); } else { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/arm/mach-davinci/board-da830-evm.c b/arch/arm/mach-davinci/board-da830-evm.c index 1076886938b6..a20ba12d876c 100644 --- a/arch/arm/mach-davinci/board-da830-evm.c +++ b/arch/arm/mach-davinci/board-da830-evm.c @@ -306,7 +306,7 @@ static struct davinci_nand_pdata da830_evm_nand_pdata = { .core_chipsel = 1, .parts = da830_evm_nand_partitions, .nr_parts = ARRAY_SIZE(da830_evm_nand_partitions), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .ecc_bits = 4, .bbt_options = NAND_BBT_USE_FLASH, .bbt_td = &da830_evm_nand_bbt_main_descr, diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c index 6751292e5f8f..428012687a80 100644 --- a/arch/arm/mach-davinci/board-da850-evm.c +++ b/arch/arm/mach-davinci/board-da850-evm.c @@ -239,7 +239,7 @@ static struct davinci_nand_pdata da850_evm_nandflash_data = { .core_chipsel = 1, .parts = da850_evm_nandflash_partition, .nr_parts = ARRAY_SIZE(da850_evm_nandflash_partition), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .ecc_bits = 4, .bbt_options = NAND_BBT_USE_FLASH, .timing = &da850_evm_nandflash_timing, diff --git a/arch/arm/mach-davinci/board-dm355-evm.c b/arch/arm/mach-davinci/board-dm355-evm.c index 5113273fda69..3c5a9e3c128a 100644 --- a/arch/arm/mach-davinci/board-dm355-evm.c +++ b/arch/arm/mach-davinci/board-dm355-evm.c @@ -82,7 +82,7 @@ static struct davinci_nand_pdata davinci_nand_data = { .mask_chipsel = BIT(14), .parts = davinci_nand_partitions, .nr_parts = ARRAY_SIZE(davinci_nand_partitions), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .bbt_options = NAND_BBT_USE_FLASH, .ecc_bits = 4, }; diff --git a/arch/arm/mach-davinci/board-dm355-leopard.c b/arch/arm/mach-davinci/board-dm355-leopard.c index b9e9950dd300..e475b2113e70 100644 --- a/arch/arm/mach-davinci/board-dm355-leopard.c +++ b/arch/arm/mach-davinci/board-dm355-leopard.c @@ -76,7 +76,8 @@ static struct davinci_nand_pdata davinci_nand_data = { .mask_chipsel = BIT(14), .parts = davinci_nand_partitions, .nr_parts = ARRAY_SIZE(davinci_nand_partitions), - .ecc_mode = NAND_ECC_HW_SYNDROME, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, + .ecc_placement = NAND_ECC_PLACEMENT_INTERLEAVED, .ecc_bits = 4, .bbt_options = NAND_BBT_USE_FLASH, }; diff --git a/arch/arm/mach-davinci/board-dm365-evm.c b/arch/arm/mach-davinci/board-dm365-evm.c index 2328b15ac067..bdf31eb77620 100644 --- a/arch/arm/mach-davinci/board-dm365-evm.c +++ b/arch/arm/mach-davinci/board-dm365-evm.c @@ -146,7 +146,7 @@ static struct davinci_nand_pdata davinci_nand_data = { .mask_chipsel = BIT(14), .parts = davinci_nand_partitions, .nr_parts = ARRAY_SIZE(davinci_nand_partitions), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .bbt_options = NAND_BBT_USE_FLASH, .ecc_bits = 4, }; diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c index a5d3708fedf6..bcb3c4070945 100644 --- a/arch/arm/mach-davinci/board-dm644x-evm.c +++ b/arch/arm/mach-davinci/board-dm644x-evm.c @@ -162,7 +162,7 @@ static struct davinci_nand_pdata davinci_evm_nandflash_data = { .core_chipsel = 0, .parts = davinci_evm_nandflash_partition, .nr_parts = ARRAY_SIZE(davinci_evm_nandflash_partition), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .ecc_bits = 1, .bbt_options = NAND_BBT_USE_FLASH, .timing = &davinci_evm_nandflash_timing, diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c index dd7d60f4139a..8319a6067a68 100644 --- a/arch/arm/mach-davinci/board-dm646x-evm.c +++ b/arch/arm/mach-davinci/board-dm646x-evm.c @@ -91,7 +91,7 @@ static struct davinci_nand_pdata davinci_nand_data = { .mask_ale = 0x40000, .parts = davinci_nand_partitions, .nr_parts = ARRAY_SIZE(davinci_nand_partitions), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .ecc_bits = 1, .options = 0, }; diff --git a/arch/arm/mach-davinci/board-mityomapl138.c b/arch/arm/mach-davinci/board-mityomapl138.c index 3382b93d9a2a..5205008c8061 100644 --- a/arch/arm/mach-davinci/board-mityomapl138.c +++ b/arch/arm/mach-davinci/board-mityomapl138.c @@ -432,7 +432,7 @@ static struct davinci_nand_pdata mityomapl138_nandflash_data = { .core_chipsel = 1, .parts = mityomapl138_nandflash_partition, .nr_parts = ARRAY_SIZE(mityomapl138_nandflash_partition), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .bbt_options = NAND_BBT_USE_FLASH, .options = NAND_BUSWIDTH_16, .ecc_bits = 1, /* 4 bit mode is not supported with 16 bit NAND */ diff --git a/arch/arm/mach-davinci/board-neuros-osd2.c b/arch/arm/mach-davinci/board-neuros-osd2.c index 6cf46bbc7e1d..b4843f68bb57 100644 --- a/arch/arm/mach-davinci/board-neuros-osd2.c +++ b/arch/arm/mach-davinci/board-neuros-osd2.c @@ -90,7 +90,7 @@ static struct davinci_nand_pdata davinci_ntosd2_nandflash_data = { .core_chipsel = 0, .parts = davinci_ntosd2_nandflash_partition, .nr_parts = ARRAY_SIZE(davinci_ntosd2_nandflash_partition), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .ecc_bits = 1, .bbt_options = NAND_BBT_USE_FLASH, }; diff --git a/arch/arm/mach-davinci/board-omapl138-hawk.c b/arch/arm/mach-davinci/board-omapl138-hawk.c index 6c79039002c9..88df8011a4e6 100644 --- a/arch/arm/mach-davinci/board-omapl138-hawk.c +++ b/arch/arm/mach-davinci/board-omapl138-hawk.c @@ -206,7 +206,7 @@ static struct davinci_nand_pdata omapl138_hawk_nandflash_data = { .core_chipsel = 1, .parts = omapl138_hawk_nandflash_partition, .nr_parts = ARRAY_SIZE(omapl138_hawk_nandflash_partition), - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, .ecc_bits = 4, .bbt_options = NAND_BBT_USE_FLASH, .options = NAND_BUSWIDTH_16, diff --git a/arch/arm/mach-davinci/devices-da8xx.c b/arch/arm/mach-davinci/devices-da8xx.c index feb206bdf6e1..bb368938fc49 100644 --- a/arch/arm/mach-davinci/devices-da8xx.c +++ b/arch/arm/mach-davinci/devices-da8xx.c @@ -10,7 +10,7 @@ #include <linux/clk-provider.h> #include <linux/clk.h> #include <linux/clkdev.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/dmaengine.h> #include <linux/init.h> #include <linux/io.h> @@ -884,6 +884,7 @@ early_param("rproc_mem", early_rproc_mem); void __init da8xx_rproc_reserve_cma(void) { + struct cma *cma; int ret; if (!rproc_base || !rproc_size) { @@ -897,13 +898,16 @@ void __init da8xx_rproc_reserve_cma(void) pr_info("%s: reserving 0x%lx @ 0x%lx...\n", __func__, rproc_size, (unsigned long)rproc_base); - ret = dma_declare_contiguous(&da8xx_dsp.dev, rproc_size, rproc_base, 0); - if (ret) - pr_err("%s: dma_declare_contiguous failed %d\n", __func__, ret); - else - rproc_mem_inited = true; + ret = dma_contiguous_reserve_area(rproc_size, rproc_base, 0, &cma, + true); + if (ret) { + pr_err("%s: dma_contiguous_reserve_area failed %d\n", + __func__, ret); + return; + } + da8xx_dsp.dev.cma_area = cma; + rproc_mem_inited = true; } - #else void __init da8xx_rproc_reserve_cma(void) diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c index 56bf29523c65..db607955a7e4 100644 --- a/arch/arm/mach-highbank/highbank.c +++ b/arch/arm/mach-highbank/highbank.c @@ -5,7 +5,7 @@ #include <linux/clk.h> #include <linux/clkdev.h> #include <linux/clocksource.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/input.h> #include <linux/io.h> #include <linux/irqchip.h> diff --git a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c index 3da4c0920198..a329e50928b6 100644 --- a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c +++ b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c @@ -16,7 +16,7 @@ #include <linux/input.h> #include <linux/gpio.h> #include <linux/delay.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/leds.h> #include <linux/platform_data/asoc-mx27vis.h> #include <sound/tlv320aic32x4.h> diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c index 96845a4eaf57..7f780ad2d459 100644 --- a/arch/arm/mach-imx/mach-mx31moboard.c +++ b/arch/arm/mach-imx/mach-mx31moboard.c @@ -4,7 +4,7 @@ */ #include <linux/delay.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/gfp.h> #include <linux/gpio.h> #include <linux/init.h> diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 184262d660ba..000f672a94c9 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -29,6 +29,7 @@ #include <linux/sched_clock.h> #include <linux/irqchip/irq-ixp4xx.h> #include <linux/platform_data/timer-ixp4xx.h> +#include <linux/dma-map-ops.h> #include <mach/udc.h> #include <mach/hardware.h> #include <mach/io.h> diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c index 638808c4e122..09a65c2dfd73 100644 --- a/arch/arm/mach-keystone/keystone.c +++ b/arch/arm/mach-keystone/keystone.c @@ -8,6 +8,7 @@ */ #include <linux/io.h> #include <linux/of.h> +#include <linux/dma-mapping.h> #include <linux/init.h> #include <linux/of_platform.h> #include <linux/of_address.h> @@ -24,8 +25,7 @@ #include "keystone.h" -static unsigned long keystone_dma_pfn_offset __read_mostly; - +#ifdef CONFIG_ARM_LPAE static int keystone_platform_notifier(struct notifier_block *nb, unsigned long event, void *data) { @@ -38,9 +38,12 @@ static int keystone_platform_notifier(struct notifier_block *nb, return NOTIFY_BAD; if (!dev->of_node) { - dev->dma_pfn_offset = keystone_dma_pfn_offset; - dev_err(dev, "set dma_pfn_offset%08lx\n", - dev->dma_pfn_offset); + int ret = dma_direct_set_offset(dev, KEYSTONE_HIGH_PHYS_START, + KEYSTONE_LOW_PHYS_START, + KEYSTONE_HIGH_PHYS_SIZE); + dev_err(dev, "set dma_offset%08llx%s\n", + KEYSTONE_HIGH_PHYS_START - KEYSTONE_LOW_PHYS_START, + ret ? " failed" : ""); } return NOTIFY_OK; } @@ -48,14 +51,14 @@ static int keystone_platform_notifier(struct notifier_block *nb, static struct notifier_block platform_nb = { .notifier_call = keystone_platform_notifier, }; +#endif /* CONFIG_ARM_LPAE */ static void __init keystone_init(void) { - if (PHYS_OFFSET >= KEYSTONE_HIGH_PHYS_START) { - keystone_dma_pfn_offset = PFN_DOWN(KEYSTONE_HIGH_PHYS_START - - KEYSTONE_LOW_PHYS_START); +#ifdef CONFIG_ARM_LPAE + if (PHYS_OFFSET >= KEYSTONE_HIGH_PHYS_START) bus_register_notifier(&platform_bus_type, &platform_nb); - } +#endif keystone_pm_runtime_init(); } diff --git a/arch/arm/mach-mvebu/coherency.c b/arch/arm/mach-mvebu/coherency.c index 8f8748a0c84f..49e3c8d20c2f 100644 --- a/arch/arm/mach-mvebu/coherency.c +++ b/arch/arm/mach-mvebu/coherency.c @@ -25,7 +25,7 @@ #include <linux/of_address.h> #include <linux/io.h> #include <linux/smp.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/mbus.h> diff --git a/arch/arm/mach-omap1/include/mach/memory.h b/arch/arm/mach-omap1/include/mach/memory.h index 1142560e0078..36bc0000cb6a 100644 --- a/arch/arm/mach-omap1/include/mach/memory.h +++ b/arch/arm/mach-omap1/include/mach/memory.h @@ -14,42 +14,11 @@ * OMAP-1510 bus address is translated into a Local Bus address if the * OMAP bus type is lbus. We do the address translation based on the * device overriding the defaults used in the dma-mapping API. - * Note that the is_lbus_device() test is not very efficient on 1510 - * because of the strncmp(). */ -#if defined(CONFIG_ARCH_OMAP15XX) && !defined(__ASSEMBLER__) /* * OMAP-1510 Local Bus address offset */ #define OMAP1510_LB_OFFSET UL(0x30000000) -#define virt_to_lbus(x) ((x) - PAGE_OFFSET + OMAP1510_LB_OFFSET) -#define lbus_to_virt(x) ((x) - OMAP1510_LB_OFFSET + PAGE_OFFSET) -#define is_lbus_device(dev) (cpu_is_omap15xx() && dev && (strncmp(dev_name(dev), "ohci", 4) == 0)) - -#define __arch_pfn_to_dma(dev, pfn) \ - ({ dma_addr_t __dma = __pfn_to_phys(pfn); \ - if (is_lbus_device(dev)) \ - __dma = __dma - PHYS_OFFSET + OMAP1510_LB_OFFSET; \ - __dma; }) - -#define __arch_dma_to_pfn(dev, addr) \ - ({ dma_addr_t __dma = addr; \ - if (is_lbus_device(dev)) \ - __dma += PHYS_OFFSET - OMAP1510_LB_OFFSET; \ - __phys_to_pfn(__dma); \ - }) - -#define __arch_dma_to_virt(dev, addr) ({ (void *) (is_lbus_device(dev) ? \ - lbus_to_virt(addr) : \ - __phys_to_virt(addr)); }) - -#define __arch_virt_to_dma(dev, addr) ({ unsigned long __addr = (unsigned long)(addr); \ - (dma_addr_t) (is_lbus_device(dev) ? \ - virt_to_lbus(__addr) : \ - __virt_to_phys(__addr)); }) - -#endif /* CONFIG_ARCH_OMAP15XX */ - #endif diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c index d8e9bbda8f7b..ba8566204ea9 100644 --- a/arch/arm/mach-omap1/usb.c +++ b/arch/arm/mach-omap1/usb.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/platform_device.h> +#include <linux/dma-mapping.h> #include <linux/io.h> #include <asm/irq.h> @@ -542,6 +543,25 @@ bad: /* ULPD_APLL_CTRL */ #define APLL_NDPLL_SWITCH (1 << 0) +static int omap_1510_usb_ohci_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct device *dev = data; + + if (event != BUS_NOTIFY_ADD_DEVICE) + return NOTIFY_DONE; + + if (strncmp(dev_name(dev), "ohci", 4) == 0 && + dma_direct_set_offset(dev, PHYS_OFFSET, OMAP1510_LB_OFFSET, + (u64)-1)) + WARN_ONCE(1, "failed to set DMA offset\n"); + return NOTIFY_OK; +} + +static struct notifier_block omap_1510_usb_ohci_nb = { + .notifier_call = omap_1510_usb_ohci_notifier, +}; + static void __init omap_1510_usb_init(struct omap_usb_config *config) { unsigned int val; @@ -600,6 +620,8 @@ static void __init omap_1510_usb_init(struct omap_usb_config *config) if (config->register_host) { int status; + bus_register_notifier(&platform_bus_type, + &omap_1510_usb_ohci_nb); ohci_device.dev.platform_data = config; status = platform_device_register(&ohci_device); if (status) diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index 3d2c108e911e..431709725d02 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -369,6 +369,15 @@ static struct pxaficp_platform_data tosa_ficp_platform_data = { /* * Tosa AC IN */ +static struct gpiod_lookup_table tosa_power_gpiod_table = { + .dev_id = "gpio-charger", + .table = { + GPIO_LOOKUP("gpio-pxa", TOSA_GPIO_AC_IN, + NULL, GPIO_ACTIVE_LOW), + { }, + }, +}; + static char *tosa_ac_supplied_to[] = { "main-battery", "backup-battery", @@ -378,8 +387,6 @@ static char *tosa_ac_supplied_to[] = { static struct gpio_charger_platform_data tosa_power_data = { .name = "charger", .type = POWER_SUPPLY_TYPE_MAINS, - .gpio = TOSA_GPIO_AC_IN, - .gpio_active_low = 1, .supplied_to = tosa_ac_supplied_to, .num_supplicants = ARRAY_SIZE(tosa_ac_supplied_to), }; @@ -951,6 +958,7 @@ static void __init tosa_init(void) clk_add_alias("CLK_CK3P6MI", tc6393xb_device.name, "GPIO11_CLK", NULL); gpiod_add_lookup_table(&tosa_udc_gpiod_table); + gpiod_add_lookup_table(&tosa_power_gpiod_table); platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/arch/arm/mach-s3c24xx/common-smdk.c b/arch/arm/mach-s3c24xx/common-smdk.c index 75064dfaceb1..121646ad1bb1 100644 --- a/arch/arm/mach-s3c24xx/common-smdk.c +++ b/arch/arm/mach-s3c24xx/common-smdk.c @@ -191,7 +191,7 @@ static struct s3c2410_platform_nand smdk_nand_info = { .twrph1 = 20, .nr_sets = ARRAY_SIZE(smdk_nand_sets), .sets = smdk_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; /* devices we initialise */ diff --git a/arch/arm/mach-s3c24xx/mach-anubis.c b/arch/arm/mach-s3c24xx/mach-anubis.c index 072966dcad78..28326241e360 100644 --- a/arch/arm/mach-s3c24xx/mach-anubis.c +++ b/arch/arm/mach-s3c24xx/mach-anubis.c @@ -218,7 +218,7 @@ static struct s3c2410_platform_nand __initdata anubis_nand_info = { .nr_sets = ARRAY_SIZE(anubis_nand_sets), .sets = anubis_nand_sets, .select_chip = anubis_nand_select, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; /* IDE channels */ diff --git a/arch/arm/mach-s3c24xx/mach-at2440evb.c b/arch/arm/mach-s3c24xx/mach-at2440evb.c index 58c5ef3cf1d7..04dedebdb57c 100644 --- a/arch/arm/mach-s3c24xx/mach-at2440evb.c +++ b/arch/arm/mach-s3c24xx/mach-at2440evb.c @@ -109,7 +109,7 @@ static struct s3c2410_platform_nand __initdata at2440evb_nand_info = { .twrph1 = 40, .nr_sets = ARRAY_SIZE(at2440evb_nand_sets), .sets = at2440evb_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; /* DM9000AEP 10/100 ethernet controller */ diff --git a/arch/arm/mach-s3c24xx/mach-bast.c b/arch/arm/mach-s3c24xx/mach-bast.c index a7c3955ae8f6..6465eab0ab3a 100644 --- a/arch/arm/mach-s3c24xx/mach-bast.c +++ b/arch/arm/mach-s3c24xx/mach-bast.c @@ -294,7 +294,7 @@ static struct s3c2410_platform_nand __initdata bast_nand_info = { .nr_sets = ARRAY_SIZE(bast_nand_sets), .sets = bast_nand_sets, .select_chip = bast_nand_select, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; /* DM9000 */ diff --git a/arch/arm/mach-s3c24xx/mach-gta02.c b/arch/arm/mach-s3c24xx/mach-gta02.c index 594901f3b8e5..732748170751 100644 --- a/arch/arm/mach-s3c24xx/mach-gta02.c +++ b/arch/arm/mach-s3c24xx/mach-gta02.c @@ -15,6 +15,7 @@ #include <linux/delay.h> #include <linux/timer.h> #include <linux/init.h> +#include <linux/gpio/machine.h> #include <linux/gpio.h> #include <linux/gpio_keys.h> #include <linux/workqueue.h> @@ -416,7 +417,7 @@ static struct s3c2410_platform_nand __initdata gta02_nand_info = { .twrph1 = 15, .nr_sets = ARRAY_SIZE(gta02_nand_sets), .sets = gta02_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; @@ -474,6 +475,20 @@ static struct platform_device gta02_buttons_device = { }, }; +static struct gpiod_lookup_table gta02_audio_gpio_table = { + .dev_id = "neo1973-audio", + .table = { + GPIO_LOOKUP("GPIOJ", 2, "amp-shut", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("GPIOJ", 1, "hp", GPIO_ACTIVE_HIGH), + { }, + }, +}; + +static struct platform_device gta02_audio = { + .name = "neo1973-audio", + .id = -1, +}; + static void __init gta02_map_io(void) { s3c24xx_init_io(gta02_iodesc, ARRAY_SIZE(gta02_iodesc)); @@ -498,6 +513,7 @@ static struct platform_device *gta02_devices[] __initdata = { >a02_buttons_device, &s3c_device_adc, &s3c_device_ts, + >a02_audio, }; static void gta02_poweroff(void) @@ -524,6 +540,7 @@ static void __init gta02_machine_init(void) i2c_register_board_info(0, gta02_i2c_devs, ARRAY_SIZE(gta02_i2c_devs)); + gpiod_add_lookup_table(>a02_audio_gpio_table); platform_add_devices(gta02_devices, ARRAY_SIZE(gta02_devices)); pm_power_off = gta02_poweroff; diff --git a/arch/arm/mach-s3c24xx/mach-h1940.c b/arch/arm/mach-s3c24xx/mach-h1940.c index f4710052843a..ecb84029e15c 100644 --- a/arch/arm/mach-s3c24xx/mach-h1940.c +++ b/arch/arm/mach-s3c24xx/mach-h1940.c @@ -475,6 +475,22 @@ static struct gpiod_lookup_table h1940_mmc_gpio_table = { }, }; +static struct gpiod_lookup_table h1940_audio_gpio_table = { + .dev_id = "h1940-audio", + .table = { + GPIO_LOOKUP("H1940_LATCH", + H1940_LATCH_AUDIO_POWER - H1940_LATCH_GPIO(0), + "speaker-power", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("GPIOG", 4, "hp", GPIO_ACTIVE_HIGH), + { }, + }, +}; + +static struct platform_device h1940_audio = { + .name = "h1940-audio", + .id = -1, +}; + static struct pwm_lookup h1940_pwm_lookup[] = { PWM_LOOKUP("samsung-pwm", 0, "pwm-backlight", NULL, 36296, PWM_POLARITY_NORMAL), @@ -651,6 +667,7 @@ static struct platform_device *h1940_devices[] __initdata = { &s3c_device_ts, &power_supply, &h1940_battery, + &h1940_audio, }; static void __init h1940_map_io(void) @@ -690,6 +707,7 @@ static void __init h1940_init(void) s3c24xx_fb_set_platdata(&h1940_fb_info); gpiod_add_lookup_table(&h1940_mmc_gpio_table); + gpiod_add_lookup_table(&h1940_audio_gpio_table); s3c24xx_mci_set_platdata(&h1940_mmc_cfg); s3c24xx_udc_set_platdata(&h1940_udc_cfg); s3c24xx_ts_set_platdata(&h1940_ts_cfg); diff --git a/arch/arm/mach-s3c24xx/mach-jive.c b/arch/arm/mach-s3c24xx/mach-jive.c index 885e8f12e4b9..8233dcff19e7 100644 --- a/arch/arm/mach-s3c24xx/mach-jive.c +++ b/arch/arm/mach-s3c24xx/mach-jive.c @@ -228,7 +228,7 @@ static struct s3c2410_platform_nand __initdata jive_nand_info = { .twrph1 = 40, .sets = jive_nand_sets, .nr_sets = ARRAY_SIZE(jive_nand_sets), - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; static int __init jive_mtdset(char *options) diff --git a/arch/arm/mach-s3c24xx/mach-mini2440.c b/arch/arm/mach-s3c24xx/mach-mini2440.c index 235749448311..057dcbaf1b22 100644 --- a/arch/arm/mach-s3c24xx/mach-mini2440.c +++ b/arch/arm/mach-s3c24xx/mach-mini2440.c @@ -296,7 +296,7 @@ static struct s3c2410_platform_nand mini2440_nand_info __initdata = { .nr_sets = ARRAY_SIZE(mini2440_nand_sets), .sets = mini2440_nand_sets, .ignore_unset_ecc = 1, - .ecc_mode = NAND_ECC_HW, + .engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST, }; /* DM9000AEP 10/100 ethernet controller */ diff --git a/arch/arm/mach-s3c24xx/mach-osiris.c b/arch/arm/mach-s3c24xx/mach-osiris.c index ee3630cb236a..157448827f61 100644 --- a/arch/arm/mach-s3c24xx/mach-osiris.c +++ b/arch/arm/mach-s3c24xx/mach-osiris.c @@ -234,7 +234,7 @@ static struct s3c2410_platform_nand __initdata osiris_nand_info = { .nr_sets = ARRAY_SIZE(osiris_nand_sets), .sets = osiris_nand_sets, .select_chip = osiris_nand_select, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; /* PCMCIA control and configuration */ diff --git a/arch/arm/mach-s3c24xx/mach-qt2410.c b/arch/arm/mach-s3c24xx/mach-qt2410.c index ff9e3197309b..f3131d94e90b 100644 --- a/arch/arm/mach-s3c24xx/mach-qt2410.c +++ b/arch/arm/mach-s3c24xx/mach-qt2410.c @@ -287,7 +287,7 @@ static struct s3c2410_platform_nand __initdata qt2410_nand_info = { .twrph1 = 20, .nr_sets = ARRAY_SIZE(qt2410_nand_sets), .sets = qt2410_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; /* UDC */ diff --git a/arch/arm/mach-s3c24xx/mach-rx1950.c b/arch/arm/mach-s3c24xx/mach-rx1950.c index fde98b175c75..3645b9c838d9 100644 --- a/arch/arm/mach-s3c24xx/mach-rx1950.c +++ b/arch/arm/mach-s3c24xx/mach-rx1950.c @@ -620,7 +620,7 @@ static struct s3c2410_platform_nand rx1950_nand_info = { .twrph1 = 15, .nr_sets = ARRAY_SIZE(rx1950_nand_sets), .sets = rx1950_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; static struct s3c2410_udc_mach_info rx1950_udc_cfg __initdata = { @@ -728,6 +728,20 @@ static struct i2c_board_info rx1950_i2c_devices[] = { }, }; +static struct gpiod_lookup_table rx1950_audio_gpio_table = { + .dev_id = "rx1950-audio", + .table = { + GPIO_LOOKUP("GPIOG", 12, "hp-gpio", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("GPIOA", 1, "speaker-power", GPIO_ACTIVE_HIGH), + { }, + }, +}; + +static struct platform_device rx1950_audio = { + .name = "rx1950-audio", + .id = -1, +}; + static struct platform_device *rx1950_devices[] __initdata = { &s3c2410_device_dclk, &s3c_device_lcd, @@ -746,6 +760,7 @@ static struct platform_device *rx1950_devices[] __initdata = { &power_supply, &rx1950_battery, &rx1950_leds, + &rx1950_audio, }; static void __init rx1950_map_io(void) @@ -813,6 +828,7 @@ static void __init rx1950_init_machine(void) gpio_direction_output(S3C2410_GPJ(6), 0); pwm_add_table(rx1950_pwm_lookup, ARRAY_SIZE(rx1950_pwm_lookup)); + gpiod_add_lookup_table(&rx1950_audio_gpio_table); platform_add_devices(rx1950_devices, ARRAY_SIZE(rx1950_devices)); i2c_register_board_info(0, rx1950_i2c_devices, diff --git a/arch/arm/mach-s3c24xx/mach-rx3715.c b/arch/arm/mach-s3c24xx/mach-rx3715.c index 995f1ff34a1b..017010d67e01 100644 --- a/arch/arm/mach-s3c24xx/mach-rx3715.c +++ b/arch/arm/mach-s3c24xx/mach-rx3715.c @@ -158,7 +158,7 @@ static struct s3c2410_platform_nand __initdata rx3715_nand_info = { .twrph1 = 15, .nr_sets = ARRAY_SIZE(rx3715_nand_sets), .sets = rx3715_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; static struct platform_device *rx3715_devices[] __initdata = { diff --git a/arch/arm/mach-s3c24xx/mach-vstms.c b/arch/arm/mach-s3c24xx/mach-vstms.c index d76b28b65e65..c5fa215a527e 100644 --- a/arch/arm/mach-s3c24xx/mach-vstms.c +++ b/arch/arm/mach-s3c24xx/mach-vstms.c @@ -112,7 +112,7 @@ static struct s3c2410_platform_nand __initdata vstms_nand_info = { .twrph1 = 20, .nr_sets = ARRAY_SIZE(vstms_nand_sets), .sets = vstms_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; static struct platform_device *vstms_devices[] __initdata = { diff --git a/arch/arm/mach-s3c64xx/mach-hmt.c b/arch/arm/mach-s3c64xx/mach-hmt.c index e7080215c624..0d9acaf91701 100644 --- a/arch/arm/mach-s3c64xx/mach-hmt.c +++ b/arch/arm/mach-s3c64xx/mach-hmt.c @@ -199,7 +199,7 @@ static struct s3c2410_platform_nand hmt_nand_info = { .twrph1 = 40, .nr_sets = ARRAY_SIZE(hmt_nand_sets), .sets = hmt_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; static struct gpio_led hmt_leds[] = { diff --git a/arch/arm/mach-s3c64xx/mach-mini6410.c b/arch/arm/mach-s3c64xx/mach-mini6410.c index 0dd36ae49e6a..6fbb57878746 100644 --- a/arch/arm/mach-s3c64xx/mach-mini6410.c +++ b/arch/arm/mach-s3c64xx/mach-mini6410.c @@ -136,7 +136,7 @@ static struct s3c2410_platform_nand mini6410_nand_info = { .twrph1 = 40, .nr_sets = ARRAY_SIZE(mini6410_nand_sets), .sets = mini6410_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; static struct s3c_fb_pd_win mini6410_lcd_type0_fb_win = { diff --git a/arch/arm/mach-s3c64xx/mach-real6410.c b/arch/arm/mach-s3c64xx/mach-real6410.c index 0ff88b6859c4..1e98e530a6aa 100644 --- a/arch/arm/mach-s3c64xx/mach-real6410.c +++ b/arch/arm/mach-s3c64xx/mach-real6410.c @@ -188,7 +188,7 @@ static struct s3c2410_platform_nand real6410_nand_info = { .twrph1 = 40, .nr_sets = ARRAY_SIZE(real6410_nand_sets), .sets = real6410_nand_sets, - .ecc_mode = NAND_ECC_SOFT, + .engine_type = NAND_ECC_ENGINE_TYPE_SOFT, }; static struct platform_device *real6410_devices[] __initdata = { diff --git a/arch/arm/mach-sa1100/collie.c b/arch/arm/mach-sa1100/collie.c index 3cc2b71e16f0..bd3a52fd09ce 100644 --- a/arch/arm/mach-sa1100/collie.c +++ b/arch/arm/mach-sa1100/collie.c @@ -30,6 +30,7 @@ #include <linux/gpio_keys.h> #include <linux/input.h> #include <linux/gpio.h> +#include <linux/gpio/machine.h> #include <linux/power/gpio-charger.h> #include <video/sa1100fb.h> @@ -131,16 +132,23 @@ static struct irda_platform_data collie_ir_data = { /* * Collie AC IN */ +static struct gpiod_lookup_table collie_power_gpiod_table = { + .dev_id = "gpio-charger", + .table = { + GPIO_LOOKUP("gpio", COLLIE_GPIO_AC_IN, + NULL, GPIO_ACTIVE_HIGH), + { }, + }, +}; + static char *collie_ac_supplied_to[] = { "main-battery", "backup-battery", }; - static struct gpio_charger_platform_data collie_power_data = { .name = "charger", .type = POWER_SUPPLY_TYPE_MAINS, - .gpio = COLLIE_GPIO_AC_IN, .supplied_to = collie_ac_supplied_to, .num_supplicants = ARRAY_SIZE(collie_ac_supplied_to), }; @@ -386,6 +394,8 @@ static void __init collie_init(void) platform_scoop_config = &collie_pcmcia_config; + gpiod_add_lookup_table(&collie_power_gpiod_table); + ret = platform_add_devices(devices, ARRAY_SIZE(devices)); if (ret) { printk(KERN_WARNING "collie: Unable to register LoCoMo device\n"); diff --git a/arch/arm/mach-shmobile/setup-rcar-gen2.c b/arch/arm/mach-shmobile/setup-rcar-gen2.c index c42ff8c314c8..e00f5b3b9293 100644 --- a/arch/arm/mach-shmobile/setup-rcar-gen2.c +++ b/arch/arm/mach-shmobile/setup-rcar-gen2.c @@ -9,7 +9,7 @@ #include <linux/clocksource.h> #include <linux/device.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/io.h> #include <linux/kernel.h> #include <linux/memblock.h> diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index 12c26eb88afb..43d91bfd2360 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -1249,20 +1249,28 @@ static void __init l2c310_of_parse(const struct device_node *np, ret = of_property_read_u32(np, "prefetch-data", &val); if (ret == 0) { - if (val) + if (val) { prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH; - else + *aux_val |= L310_PREFETCH_CTRL_DATA_PREFETCH; + } else { prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH; + *aux_val &= ~L310_PREFETCH_CTRL_DATA_PREFETCH; + } + *aux_mask &= ~L310_PREFETCH_CTRL_DATA_PREFETCH; } else if (ret != -EINVAL) { pr_err("L2C-310 OF prefetch-data property value is missing\n"); } ret = of_property_read_u32(np, "prefetch-instr", &val); if (ret == 0) { - if (val) + if (val) { prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH; - else + *aux_val |= L310_PREFETCH_CTRL_INSTR_PREFETCH; + } else { prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH; + *aux_val &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH; + } + *aux_mask &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH; } else if (ret != -EINVAL) { pr_err("L2C-310 OF prefetch-instr property value is missing\n"); } diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index 287ef898a55e..6bfd2b884505 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -8,6 +8,7 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <linux/scatterlist.h> #include <asm/cachetype.h> @@ -176,6 +177,8 @@ static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist const struct dma_map_ops arm_nommu_dma_ops = { .alloc = arm_nommu_dma_alloc, .free = arm_nommu_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_nommu_dma_mmap, .map_page = arm_nommu_dma_map_page, .unmap_page = arm_nommu_dma_unmap_page, diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 8a8949174b1c..c4b8df2ad328 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -15,9 +15,7 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/dma-direct.h> -#include <linux/dma-mapping.h> -#include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/highmem.h> #include <linux/memblock.h> #include <linux/slab.h> @@ -35,7 +33,6 @@ #include <asm/dma-iommu.h> #include <asm/mach/map.h> #include <asm/system_info.h> -#include <asm/dma-contiguous.h> #include <xen/swiotlb-xen.h> #include "dma.h" @@ -199,6 +196,8 @@ static int arm_dma_supported(struct device *dev, u64 mask) const struct dma_map_ops arm_dma_ops = { .alloc = arm_dma_alloc, .free = arm_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_dma_mmap, .get_sgtable = arm_dma_get_sgtable, .map_page = arm_dma_map_page, @@ -226,6 +225,8 @@ static int arm_coherent_dma_mmap(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops arm_coherent_dma_ops = { .alloc = arm_coherent_dma_alloc, .free = arm_coherent_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_coherent_dma_mmap, .get_sgtable = arm_dma_get_sgtable, .map_page = arm_coherent_dma_map_page, diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 45f9d5ec2360..d57112a276f5 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -18,7 +18,7 @@ #include <linux/highmem.h> #include <linux/gfp.h> #include <linux/memblock.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/sizes.h> #include <linux/stop_machine.h> #include <linux/swiotlb.h> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 698cc740c6b8..ab69250a86bc 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -17,7 +17,6 @@ #include <asm/cp15.h> #include <asm/cputype.h> -#include <asm/sections.h> #include <asm/cachetype.h> #include <asm/fixmap.h> #include <asm/sections.h> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 171077cbf419..d056a548358e 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -453,3 +453,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index a54f70731d9f..150ce6e6a5d3 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -19,7 +19,7 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO32 ldflags-$(CONFIG_CPU_ENDIAN_BE8) := --be8 ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \ -z max-page-size=4096 -nostdlib -shared $(ldflags-y) \ - --hash-style=sysv --build-id \ + --hash-style=sysv --build-id=sha1 \ -T obj-$(CONFIG_VDSO) += vdso.o diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index d3ef975a0965..467fa225c3d0 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/cpu.h> #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/export.h> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 893130ce1626..f858c352f72a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -123,6 +123,7 @@ config ARM64 select GENERIC_VDSO_TIME_NS select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND + select HAVE_MOVE_PMD select HAVE_PCI select HAVE_ACPI_APEI if (ACPI && EFI) select HAVE_ALIGNED_STRUCT_PAGE if SLUB @@ -194,6 +195,7 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY + select SET_FS select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index f4717facf31e..5789c2d18d43 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -10,13 +10,13 @@ # # Copyright (C) 1995-2001 by Russell King -LDFLAGS_vmlinux :=--no-undefined -X +LDFLAGS_vmlinux :=--no-undefined -X -z norelro ifeq ($(CONFIG_RELOCATABLE), y) # Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour # for relative relocs, since this leads to better Image compression # with the relocation offsets always being zero. -LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \ +LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \ $(call ld-option, --no-apply-dynamic-relocs) endif @@ -126,10 +126,6 @@ endif CHECKFLAGS += -D__aarch64__ -ifeq ($(CONFIG_ARM64_MODULE_PLTS),y) -KBUILD_LDS_MODULE += $(srctree)/arch/arm64/kernel/module.lds -endif - ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y) KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY CC_FLAGS_FTRACE := -fpatchable-function-entry=2 diff --git a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi index 250fc01de78d..24aab3ea3f52 100644 --- a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi +++ b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi @@ -795,8 +795,8 @@ reg = <0x27>; interrupt-parent = <&gpa1>; interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; - s3fwrn5,en-gpios = <&gpf1 4 GPIO_ACTIVE_HIGH>; - s3fwrn5,fw-gpios = <&gpj0 2 GPIO_ACTIVE_HIGH>; + en-gpios = <&gpf1 4 GPIO_ACTIVE_HIGH>; + wake-gpios = <&gpj0 2 GPIO_ACTIVE_HIGH>; }; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts index c2dc1232f93f..1efb61cff454 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts @@ -199,6 +199,7 @@ &enetc_port0 { phy-handle = <&sgmii_phy0>; phy-connection-type = "sgmii"; + managed = "in-band-status"; status = "okay"; mdio { diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index 2bbc69b4dc99..d5b6c0a1c54a 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -316,7 +316,7 @@ }; pcie_reset_pins: pcie-reset-pins { - groups = "pcie1"; + groups = "pcie1"; /* this actually controls "pcie1_reset" */ function = "gpio"; }; diff --git a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts index d174ad214857..9a11e5c60c26 100644 --- a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts +++ b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts @@ -143,6 +143,56 @@ mdio: mdio-bus { #address-cells = <1>; #size-cells = <0>; + + switch@0 { + compatible = "mediatek,mt7531"; + reg = <0>; + reset-gpios = <&pio 54 0>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + label = "wan"; + }; + + port@1 { + reg = <1>; + label = "lan0"; + }; + + port@2 { + reg = <2>; + label = "lan1"; + }; + + port@3 { + reg = <3>; + label = "lan2"; + }; + + port@4 { + reg = <4>; + label = "lan3"; + }; + + port@6 { + reg = <6>; + label = "cpu"; + ethernet = <&gmac0>; + phy-mode = "2500base-x"; + + fixed-link { + speed = <2500>; + full-duplex; + pause; + }; + }; + }; + }; + }; }; diff --git a/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts b/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts index 0b4de627f96e..08ad0ffb24df 100644 --- a/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts +++ b/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts @@ -105,20 +105,71 @@ pinctrl-0 = <ð_pins>; status = "okay"; - gmac1: mac@1 { + gmac0: mac@0 { compatible = "mediatek,eth-mac"; - reg = <1>; - phy-handle = <&phy5>; + reg = <0>; + phy-mode = "2500base-x"; + + fixed-link { + speed = <2500>; + full-duplex; + pause; + }; }; mdio-bus { #address-cells = <1>; #size-cells = <0>; - phy5: ethernet-phy@5 { - reg = <5>; - phy-mode = "sgmii"; + switch@0 { + compatible = "mediatek,mt7531"; + reg = <0>; + reset-gpios = <&pio 54 0>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + label = "lan0"; + }; + + port@1 { + reg = <1>; + label = "lan1"; + }; + + port@2 { + reg = <2>; + label = "lan2"; + }; + + port@3 { + reg = <3>; + label = "lan3"; + }; + + port@4 { + reg = <4>; + label = "wan"; + }; + + port@6 { + reg = <6>; + label = "cpu"; + ethernet = <&gmac0>; + phy-mode = "2500base-x"; + + fixed-link { + speed = <2500>; + full-duplex; + pause; + }; + }; + }; }; + }; }; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 54d181177656..ddbe6bf00e33 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -219,6 +219,23 @@ lr .req x30 // link register .endm /* + * @dst: destination register + */ +#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) + .macro this_cpu_offset, dst + mrs \dst, tpidr_el2 + .endm +#else + .macro this_cpu_offset, dst +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + mrs \dst, tpidr_el1 +alternative_else + mrs \dst, tpidr_el2 +alternative_endif + .endm +#endif + + /* * @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP) * @sym: The name of the per-cpu variable * @tmp: scratch register @@ -226,11 +243,7 @@ lr .req x30 // link register .macro adr_this_cpu, dst, sym, tmp adrp \tmp, \sym add \dst, \tmp, #:lo12:\sym -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs \tmp, tpidr_el1 -alternative_else - mrs \tmp, tpidr_el2 -alternative_endif + this_cpu_offset \tmp add \dst, \dst, \tmp .endm @@ -241,11 +254,7 @@ alternative_endif */ .macro ldr_this_cpu dst, sym, tmp adr_l \dst, \sym -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs \tmp, tpidr_el1 -alternative_else - mrs \tmp, tpidr_el2 -alternative_endif + this_cpu_offset \tmp ldr \dst, [\dst, \tmp] .endm diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index fb4c27506ef4..c3009b0e5239 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -45,6 +45,7 @@ #define rmb() dsb(ld) #define wmb() dsb(st) +#define dma_mb() dmb(osh) #define dma_rmb() dmb(oshld) #define dma_wmb() dmb(oshst) diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h new file mode 100644 index 000000000000..daa1a1da539e --- /dev/null +++ b/arch/arm64/include/asm/hyp_image.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + * Written by David Brazdil <dbrazdil@google.com> + */ + +#ifndef __ARM64_HYP_IMAGE_H__ +#define __ARM64_HYP_IMAGE_H__ + +/* + * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, + * to separate it from the kernel proper. + */ +#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym + +#ifdef LINKER_SCRIPT + +/* + * KVM nVHE ELF section names are prefixed with .hyp, to separate them + * from the kernel proper. + */ +#define HYP_SECTION_NAME(NAME) .hyp##NAME + +/* Defines an ELF hyp section from input section @NAME and its subsections. */ +#define HYP_SECTION(NAME) \ + HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) } + +/* + * Defines a linker script alias of a kernel-proper symbol referenced by + * KVM nVHE hyp code. + */ +#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym; + +#endif /* LINKER_SCRIPT */ + +#endif /* __ARM64_HYP_IMAGE_H__ */ diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index ff50dd731852..fd172c41df90 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -110,6 +110,7 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) #define __io_par(v) __iormb(v) #define __iowmb() dma_wmb() +#define __iomb() dma_mb() /* * Relaxed I/O memory access primitives. These follow the Device memory diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7f7072f6cb45..54387ccd1ab2 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -7,6 +7,7 @@ #ifndef __ARM_KVM_ASM_H__ #define __ARM_KVM_ASM_H__ +#include <asm/hyp_image.h> #include <asm/virt.h> #define ARM_EXIT_WITH_SERROR_BIT 31 @@ -35,17 +36,34 @@ #define __SMCCC_WORKAROUND_1_SMC_SZ 36 +#define KVM_HOST_SMCCC_ID(id) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + (id)) + +#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name) + +#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0 +#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1 +#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 +#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 +#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 +#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 + #ifndef __ASSEMBLY__ #include <linux/mm.h> -/* - * Translate name of a symbol defined in nVHE hyp to the name seen - * by kernel proper. All nVHE symbols are prefixed by the build system - * to avoid clashes with the VHE variants. - */ -#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym - #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] #define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[] @@ -57,10 +75,53 @@ DECLARE_KVM_VHE_SYM(sym); \ DECLARE_KVM_NVHE_SYM(sym) +#define DECLARE_KVM_VHE_PER_CPU(type, sym) \ + DECLARE_PER_CPU(type, sym) +#define DECLARE_KVM_NVHE_PER_CPU(type, sym) \ + DECLARE_PER_CPU(type, kvm_nvhe_sym(sym)) + +#define DECLARE_KVM_HYP_PER_CPU(type, sym) \ + DECLARE_KVM_VHE_PER_CPU(type, sym); \ + DECLARE_KVM_NVHE_PER_CPU(type, sym) + +/* + * Compute pointer to a symbol defined in nVHE percpu region. + * Returns NULL if percpu memory has not been allocated yet. + */ +#define this_cpu_ptr_nvhe_sym(sym) per_cpu_ptr_nvhe_sym(sym, smp_processor_id()) +#define per_cpu_ptr_nvhe_sym(sym, cpu) \ + ({ \ + unsigned long base, off; \ + base = kvm_arm_hyp_percpu_base[cpu]; \ + off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \ + (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \ + base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \ + }) + +#if defined(__KVM_NVHE_HYPERVISOR__) + +#define CHOOSE_NVHE_SYM(sym) sym +#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym) + +/* The nVHE hypervisor shouldn't even try to access VHE symbols */ +extern void *__nvhe_undefined_symbol; +#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol +#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol) +#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol) + +#elif defined(__KVM_VHE_HYPERVISOR__) + #define CHOOSE_VHE_SYM(sym) sym -#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) +#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym) + +/* The VHE hypervisor shouldn't even try to access nVHE symbols */ +extern void *__vhe_undefined_symbol; +#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol +#define this_cpu_ptr_hyp_sym(sym) (&__vhe_undefined_symbol) +#define per_cpu_ptr_hyp_sym(sym, cpu) (&__vhe_undefined_symbol) + +#else -#ifndef __KVM_NVHE_HYPERVISOR__ /* * BIG FAT WARNINGS: * @@ -72,12 +133,21 @@ * - Don't let the nVHE hypervisor have access to this, as it will * pick the *wrong* symbol (yes, it runs at EL2...). */ -#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \ +#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \ + ? CHOOSE_VHE_SYM(sym) \ : CHOOSE_NVHE_SYM(sym)) -#else -/* The nVHE hypervisor shouldn't even try to access anything */ -extern void *__nvhe_undefined_symbol; -#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol + +#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \ + ? this_cpu_ptr(&sym) \ + : this_cpu_ptr_nvhe_sym(sym)) + +#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \ + ? per_cpu_ptr(&sym, cpu) \ + : per_cpu_ptr_nvhe_sym(sym, cpu)) + +#define CHOOSE_VHE_SYM(sym) sym +#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) + #endif /* Translate a kernel address @ptr into its equivalent linear mapping */ @@ -95,10 +165,16 @@ struct kvm_vcpu; struct kvm_s2_mmu; DECLARE_KVM_NVHE_SYM(__kvm_hyp_init); +DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector); DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) +#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) +extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +DECLARE_KVM_NVHE_SYM(__per_cpu_start); +DECLARE_KVM_NVHE_SYM(__per_cpu_end); + extern atomic_t arm64_el2_vector_last_slot; DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) @@ -144,26 +220,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; addr; \ }) -/* - * Home-grown __this_cpu_{ptr,read} variants that always work at HYP, - * provided that sym is really a *symbol* and not a pointer obtained from - * a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps - * sparse quiet. - */ -#define __hyp_this_cpu_ptr(sym) \ - ({ \ - void *__ptr; \ - __verify_pcpu_ptr(&sym); \ - __ptr = hyp_symbol_addr(sym); \ - __ptr += read_sysreg(tpidr_el2); \ - (typeof(sym) __kernel __force *)__ptr; \ - }) - -#define __hyp_this_cpu_read(sym) \ - ({ \ - *__hyp_this_cpu_ptr(sym); \ - }) - #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ " .align 3\n" \ @@ -194,20 +250,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; #else /* __ASSEMBLY__ */ -.macro hyp_adr_this_cpu reg, sym, tmp - adr_l \reg, \sym - mrs \tmp, tpidr_el2 - add \reg, \reg, \tmp -.endm - -.macro hyp_ldr_this_cpu reg, sym, tmp - adr_l \reg, \sym - mrs \tmp, tpidr_el2 - ldr \reg, [\reg, \tmp] -.endm - .macro get_host_ctxt reg, tmp - hyp_adr_this_cpu \reg, kvm_host_data, \tmp + adr_this_cpu \reg, kvm_host_data, \tmp add \reg, \reg, #HOST_DATA_CONTEXT .endm @@ -216,6 +260,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] .endm +.macro get_loaded_vcpu vcpu, ctxt + adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu + ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] +.endm + +.macro set_loaded_vcpu vcpu, ctxt, tmp + adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp + str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] +.endm + /* * KVM extable for unexpected exceptions. * In the same format _asm_extable, but output to a different section so that @@ -231,6 +285,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; .popsection .endm +#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x) +#define CPU_LR_OFFSET CPU_XREG_OFFSET(30) +#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8) + +/* + * We treat x18 as callee-saved as the host may use it as a platform + * register (e.g. for shadow call stack). + */ +.macro save_callee_saved_regs ctxt + str x18, [\ctxt, #CPU_XREG_OFFSET(18)] + stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] + stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] + stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] + stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] + stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] + stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] +.endm + +.macro restore_callee_saved_regs ctxt + // We require \ctxt is not x18-x28 + ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)] + ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] + ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] + ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] + ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] + ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] + ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] +.endm + +.macro save_sp_el0 ctxt, tmp + mrs \tmp, sp_el0 + str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] +.endm + +.macro restore_sp_el0 ctxt, tmp + ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] + msr sp_el0, \tmp +.endm + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bb5e5b88d439..0aecbab6a7fb 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -11,6 +11,7 @@ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ +#include <linux/arm-smccc.h> #include <linux/bitmap.h> #include <linux/types.h> #include <linux/jump_label.h> @@ -79,8 +80,8 @@ struct kvm_s2_mmu { * for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the * canonical stage-2 page tables. */ - pgd_t *pgd; phys_addr_t pgd_phys; + struct kvm_pgtable *pgt; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -110,6 +111,13 @@ struct kvm_arch { * supported. */ bool return_nisv_io_abort_to_user; + + /* + * VM-wide PMU filter, implemented as a bitmap and big enough for + * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+). + */ + unsigned long *pmu_filter; + unsigned int pmuver; }; struct kvm_vcpu_fault_info { @@ -262,8 +270,6 @@ struct kvm_host_data { struct kvm_pmu_events pmu_events; }; -typedef struct kvm_host_data kvm_host_data_t; - struct vcpu_reset_state { unsigned long pc; unsigned long r0; @@ -480,18 +486,15 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -u64 __kvm_call_hyp(void *hypfn, ...); - -#define kvm_call_hyp_nvhe(f, ...) \ - do { \ - DECLARE_KVM_NVHE_SYM(f); \ - __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \ - } while(0) - -#define kvm_call_hyp_nvhe_ret(f, ...) \ +#define kvm_call_hyp_nvhe(f, ...) \ ({ \ - DECLARE_KVM_NVHE_SYM(f); \ - __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \ + struct arm_smccc_res res; \ + \ + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \ + ##__VA_ARGS__, &res); \ + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ + \ + res.a1; \ }) /* @@ -517,7 +520,7 @@ u64 __kvm_call_hyp(void *hypfn, ...); ret = f(__VA_ARGS__); \ isb(); \ } else { \ - ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \ + ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ \ ret; \ @@ -565,7 +568,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data); +DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data); static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 46689e7db46c..6b664de5ec1f 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -12,6 +12,9 @@ #include <asm/alternative.h> #include <asm/sysreg.h> +DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); + #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ @@ -87,11 +90,11 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); void deactivate_traps_vhe_put(void); #endif -u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); +u64 __guest_enter(struct kvm_vcpu *vcpu); -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt); +void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ -void __noreturn __hyp_do_panic(unsigned long, ...); +void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index cff1cebc7590..331394306cce 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -44,16 +44,6 @@ * HYP_VA_MIN = 1 << (VA_BITS - 1) * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 * - * This of course assumes that the trampoline page exists within the - * VA_BITS range. If it doesn't, then it means we're in the odd case - * where the kernel idmap (as well as HYP) uses more levels than the - * kernel runtime page tables (as seen when the kernel is configured - * for 4k pages, 39bits VA, and yet memory lives just above that - * limit, forcing the idmap to use 4 levels of page tables while the - * kernel itself only uses 3). In this particular case, it doesn't - * matter which side of VA_BITS we use, as we're guaranteed not to - * conflict with anything. - * * When using VHE, there are no separate hyp mappings and all KVM * functionality is already mapped as part of the main kernel * mappings, and none of this applies in that case. @@ -118,15 +108,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) #define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - +#include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> -int create_hyp_mappings(void *from, void *to, pgprot_t prot); +int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr); @@ -142,149 +127,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); - phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); -void kvm_clear_hyp_idmap(void); - -#define kvm_mk_pmd(ptep) \ - __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) -#define kvm_mk_pud(pmdp) \ - __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) -#define kvm_mk_p4d(pmdp) \ - __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE) - -#define kvm_set_pud(pudp, pud) set_pud(pudp, pud) - -#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) -#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) -#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) - -#define kvm_pud_pfn(pud) pud_pfn(pud) - -#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) -#define kvm_pud_mkhuge(pud) pud_mkhuge(pud) - -static inline pte_t kvm_s2pte_mkwrite(pte_t pte) -{ - pte_val(pte) |= PTE_S2_RDWR; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) -{ - pmd_val(pmd) |= PMD_S2_RDWR; - return pmd; -} - -static inline pud_t kvm_s2pud_mkwrite(pud_t pud) -{ - pud_val(pud) |= PUD_S2_RDWR; - return pud; -} - -static inline pte_t kvm_s2pte_mkexec(pte_t pte) -{ - pte_val(pte) &= ~PTE_S2_XN; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) -{ - pmd_val(pmd) &= ~PMD_S2_XN; - return pmd; -} - -static inline pud_t kvm_s2pud_mkexec(pud_t pud) -{ - pud_val(pud) &= ~PUD_S2_XN; - return pud; -} - -static inline void kvm_set_s2pte_readonly(pte_t *ptep) -{ - pteval_t old_pteval, pteval; - - pteval = READ_ONCE(pte_val(*ptep)); - do { - old_pteval = pteval; - pteval &= ~PTE_S2_RDWR; - pteval |= PTE_S2_RDONLY; - pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); - } while (pteval != old_pteval); -} - -static inline bool kvm_s2pte_readonly(pte_t *ptep) -{ - return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY; -} - -static inline bool kvm_s2pte_exec(pte_t *ptep) -{ - return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN); -} - -static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp) -{ - kvm_set_s2pte_readonly((pte_t *)pmdp); -} - -static inline bool kvm_s2pmd_readonly(pmd_t *pmdp) -{ - return kvm_s2pte_readonly((pte_t *)pmdp); -} - -static inline bool kvm_s2pmd_exec(pmd_t *pmdp) -{ - return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); -} - -static inline void kvm_set_s2pud_readonly(pud_t *pudp) -{ - kvm_set_s2pte_readonly((pte_t *)pudp); -} - -static inline bool kvm_s2pud_readonly(pud_t *pudp) -{ - return kvm_s2pte_readonly((pte_t *)pudp); -} - -static inline bool kvm_s2pud_exec(pud_t *pudp) -{ - return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); -} - -static inline pud_t kvm_s2pud_mkyoung(pud_t pud) -{ - return pud_mkyoung(pud); -} - -static inline bool kvm_s2pud_young(pud_t pud) -{ - return pud_young(pud); -} - -#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) - -#ifdef __PAGETABLE_PMD_FOLDED -#define hyp_pmd_table_empty(pmdp) (0) -#else -#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#endif - -#ifdef __PAGETABLE_PUD_FOLDED -#define hyp_pud_table_empty(pudp) (0) -#else -#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) -#endif - -#ifdef __PAGETABLE_P4D_FOLDED -#define hyp_p4d_table_empty(p4dp) (0) -#else -#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp) -#endif struct kvm; @@ -326,77 +171,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, } } -static inline void __kvm_flush_dcache_pte(pte_t pte) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pte_page(pte); - kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE); - } -} - -static inline void __kvm_flush_dcache_pmd(pmd_t pmd) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pmd_page(pmd); - kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE); - } -} - -static inline void __kvm_flush_dcache_pud(pud_t pud) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pud_page(pud); - kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE); - } -} - void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); -static inline bool __kvm_cpu_uses_extended_idmap(void) -{ - return __cpu_uses_extended_idmap_level(); -} - -static inline unsigned long __kvm_idmap_ptrs_per_pgd(void) -{ - return idmap_ptrs_per_pgd; -} - -/* - * Can't use pgd_populate here, because the extended idmap adds an extra level - * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended - * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4. - */ -static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, - pgd_t *hyp_pgd, - pgd_t *merged_hyp_pgd, - unsigned long hyp_idmap_start) -{ - int idmap_idx; - u64 pgd_addr; - - /* - * Use the first entry to access the HYP mappings. It is - * guaranteed to be free, otherwise we wouldn't use an - * extended idmap. - */ - VM_BUG_ON(pgd_val(merged_hyp_pgd[0])); - pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd)); - merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE); - - /* - * Create another extended level entry that points to the boot HYP map, - * which contains an ID mapping of the HYP init code. We essentially - * merge the boot and runtime HYP maps by doing so, but they don't - * overlap anyway, so this is fine. - */ - idmap_idx = hyp_idmap_start >> VA_BITS; - VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx])); - pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd)); - merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE); -} - static inline unsigned int kvm_get_vmid_bits(void) { int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); @@ -479,30 +256,6 @@ static inline void *kvm_get_hyp_vector(void) #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) -/* - * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. - * With v8.2 LVA extensions, 'x' should be a minimum of 6 with - * 52bit IPS. - */ -static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) -{ - int x = ARM64_VTTBR_X(ipa_shift, levels); - - return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; -} - -static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) -{ - unsigned int x = arm64_vttbr_x(ipa_shift, levels); - - return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); -} - -static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) -{ - return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); -} - static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h new file mode 100644 index 000000000000..52ab38db04c7 --- /dev/null +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Will Deacon <will@kernel.org> + */ + +#ifndef __ARM64_KVM_PGTABLE_H__ +#define __ARM64_KVM_PGTABLE_H__ + +#include <linux/bits.h> +#include <linux/kvm_host.h> +#include <linux/types.h> + +typedef u64 kvm_pte_t; + +/** + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + */ +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pte_t *pgd; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; +}; + +/** + * enum kvm_pgtable_prot - Page-table permissions and attributes. + * @KVM_PGTABLE_PROT_X: Execute permission. + * @KVM_PGTABLE_PROT_W: Write permission. + * @KVM_PGTABLE_PROT_R: Read permission. + * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + */ +enum kvm_pgtable_prot { + KVM_PGTABLE_PROT_X = BIT(0), + KVM_PGTABLE_PROT_W = BIT(1), + KVM_PGTABLE_PROT_R = BIT(2), + + KVM_PGTABLE_PROT_DEVICE = BIT(3), +}; + +#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) +#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) +#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) + +/** + * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. + * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid + * entries. + * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their + * children. + * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their + * children. + */ +enum kvm_pgtable_walk_flags { + KVM_PGTABLE_WALK_LEAF = BIT(0), + KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), + KVM_PGTABLE_WALK_TABLE_POST = BIT(2), +}; + +typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg); + +/** + * struct kvm_pgtable_walker - Hook into a page-table walk. + * @cb: Callback function to invoke during the walk. + * @arg: Argument passed to the callback function. + * @flags: Bitwise-OR of flags to identify the entry types on which to + * invoke the callback function. + */ +struct kvm_pgtable_walker { + const kvm_pgtable_visitor_fn_t cb; + void * const arg; + const enum kvm_pgtable_walk_flags flags; +}; + +/** + * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. + * @pgt: Uninitialised page-table structure to initialise. + * @va_bits: Maximum virtual address bits. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits); + +/** + * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt); + +/** + * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). + * @addr: Virtual address at which to place the mapping. + * @size: Size of the mapping. + * @phys: Physical address of the memory to map. + * @prot: Permissions and attributes for the mapping. + * + * The offset of @addr within a page is ignored, @size is rounded-up to + * the next page boundary and @phys is rounded-down to the previous page + * boundary. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. Attempts to install a new mapping + * for a virtual address that is already mapped will be rejected with an + * error and a WARN(). + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot); + +/** + * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. + * @pgt: Uninitialised page-table structure to initialise. + * @kvm: KVM structure representing the guest virtual machine. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm); + +/** + * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); + +/** + * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address at which to place the mapping. + * @size: Size of the mapping. + * @phys: Physical address of the memory to map. + * @prot: Permissions and attributes for the mapping. + * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to + * allocate page-table pages. + * + * The offset of @addr within a page is ignored, @size is rounded-up to + * the next page boundary and @phys is rounded-down to the previous page + * boundary. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. + * + * Note that this function will both coalesce existing table entries and split + * existing block mappings, relying on page-faults to fault back areas outside + * of the new mapping lazily. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *mc); + +/** + * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to remove the mapping. + * @size: Size of the mapping. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * TLB invalidation is performed for each page-table entry cleared during the + * unmapping operation and the reference count for the page-table page + * containing the cleared entry is decremented, with unreferenced pages being + * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if + * FWB is not supported by the CPU. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range + * without TLB invalidation. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to write-protect, + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Note that it is the caller's responsibility to invalidate the TLB after + * calling this function to ensure that the updated permissions are visible + * to the CPUs. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * set the access flag in that entry. + * + * Return: The old page-table entry prior to setting the flag, 0 on failure. + */ +kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * clear the access flag in that entry. + * + * Note that it is the caller's responsibility to invalidate the TLB after + * calling this function to ensure that the updated permissions are visible + * to the CPUs. + * + * Return: The old page-table entry prior to clearing the flag, 0 on failure. + */ +kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a + * page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * @prot: Additional permissions to grant for the mapping. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * relax the permissions in that entry according to the read, write and + * execute permissions specified by @prot. No permissions are removed, and + * TLB invalidation is performed after updating the entry. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot); + +/** + * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the + * access flag set. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * Return: True if the page-table entry has the access flag set, false otherwise. + */ +bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point + * of Coherency for guest stage-2 address + * range. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to flush. + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_walk() - Walk a page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). + * @addr: Input address for the start of the walk. + * @size: Size of the range to walk. + * @walker: Walker callback description. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * The walker will walk the page-table entries corresponding to the input + * address range specified, visiting entries according to the walker flags. + * Invalid entries are treated as leaf entries. Leaf entries are reloaded + * after invoking the walker callback, allowing the walker to descend into + * a newly installed table. + * + * Returning a negative error code from the walker callback function will + * terminate the walk immediately with the same error code. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_pgtable_walker *walker); + +#endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h index 0ddf98c3ba9f..0cd0965255d2 100644 --- a/arch/arm64/include/asm/kvm_ptrauth.h +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -60,7 +60,7 @@ .endm /* - * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will + * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will * check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as * (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and * then proceed ahead with the save/restore of Pointer Authentication @@ -78,7 +78,7 @@ alternative_else_nop_endif .L__skip_switch\@: .endm -.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3 alternative_if_not ARM64_HAS_ADDRESS_AUTH b .L__skip_switch\@ alternative_else_nop_endif @@ -96,7 +96,7 @@ alternative_else_nop_endif #else /* !CONFIG_ARM64_PTR_AUTH */ .macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3 .endm -.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3 .endm #endif /* CONFIG_ARM64_PTR_AUTH */ #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 43640d797455..cd61239bae8c 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -169,7 +169,6 @@ extern u64 vabits_actual; #define PAGE_END (_PAGE_END(vabits_actual)) -extern s64 physvirt_offset; extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) @@ -245,7 +244,7 @@ static inline const void *__tag_set(const void *addr, u8 tag) */ #define __is_lm_address(addr) (!(((u64)addr) & BIT(vabits_actual - 1))) -#define __lm_to_phys(addr) (((addr) + physvirt_offset)) +#define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET) #define __kimg_to_phys(addr) ((addr) - kimage_voffset) #define __virt_to_phys_nodebug(x) ({ \ @@ -263,7 +262,7 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define __phys_addr_symbol(x) __pa_symbol_nodebug(x) #endif /* CONFIG_DEBUG_VIRTUAL */ -#define __phys_to_virt(x) ((unsigned long)((x) - physvirt_offset)) +#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET) #define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/include/asm/module.lds.h index 22e36a21c113..691f15af788e 100644 --- a/arch/arm64/kernel/module.lds +++ b/arch/arm64/include/asm/module.lds.h @@ -1,5 +1,7 @@ +#ifdef CONFIG_ARM64_MODULE_PLTS SECTIONS { .plt (NOLOAD) : { BYTE(0) } .init.plt (NOLOAD) : { BYTE(0) } .text.ftrace_trampoline (NOLOAD) : { BYTE(0) } } +#endif diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 0b6409b89e5e..1599e17379d8 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off) :: "r" (off) : "memory"); } -static inline unsigned long __my_cpu_offset(void) +static inline unsigned long __hyp_my_cpu_offset(void) +{ + /* + * Non-VHE hyp code runs with preemption disabled. No need to hazard + * the register access against barrier() as in __kern_my_cpu_offset. + */ + return read_sysreg(tpidr_el2); +} + +static inline unsigned long __kern_my_cpu_offset(void) { unsigned long off; @@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void) return off; } -#define __my_cpu_offset __my_cpu_offset() + +#ifdef __KVM_NVHE_HYPERVISOR__ +#define __my_cpu_offset __hyp_my_cpu_offset() +#else +#define __my_cpu_offset __kern_my_cpu_offset() +#endif #define PERCPU_RW_OPS(sz) \ static inline unsigned long __percpu_read_##sz(void *ptr) \ @@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd) #include <asm-generic/percpu.h> +/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */ +#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT) +#undef this_cpu_ptr +#define this_cpu_ptr raw_cpu_ptr +#undef __this_cpu_read +#define __this_cpu_read raw_cpu_read +#undef __this_cpu_write +#define __this_cpu_write raw_cpu_write +#endif + #endif /* __ASM_PERCPU_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 94b3f2ac2e9d..01a96d07ae74 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -146,7 +146,6 @@ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ -#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */ #define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) #ifdef CONFIG_ARM64_PA_BITS_52 @@ -163,34 +162,11 @@ #define PTE_ATTRINDX_MASK (_AT(pteval_t, 7) << 2) /* - * 2nd stage PTE definitions - */ -#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ -#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ -#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */ -#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */ - -#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */ -#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ -#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ -#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */ - -#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ -#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ -#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ - -/* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) /* - * EL2/HYP PTE/PMD definitions - */ -#define PMD_HYP PMD_SECT_USER -#define PTE_HYP PTE_USER - -/* * Highest possible physical address supported. */ #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 4cd0d6ca8aa1..046be789fbb4 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -64,7 +64,6 @@ extern bool arm64_use_ng_mappings; #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) -#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT #define PAGE_KERNEL __pgprot(PROT_NORMAL) #define PAGE_KERNEL_TAGGED __pgprot(PROT_NORMAL_TAGGED) @@ -73,11 +72,6 @@ extern bool arm64_use_ng_mappings; #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) #define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) -#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) -#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) -#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) -#define PAGE_HYP_DEVICE __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN) - #define PAGE_S2_MEMATTR(attr) \ ({ \ u64 __val; \ @@ -88,19 +82,6 @@ extern bool arm64_use_ng_mappings; __val; \ }) -#define PAGE_S2_XN \ - ({ \ - u64 __val; \ - if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \ - __val = 0; \ - else \ - __val = PTE_S2_XN; \ - __val; \ - }) - -#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN) -#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN) - #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index a11bf52e0c38..4ff12a7adcfd 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -24,6 +24,8 @@ #define VMALLOC_START (MODULES_END) #define VMALLOC_END (- PUD_SIZE - VMEMMAP_SIZE - SZ_64K) +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) + #define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ @@ -34,8 +36,6 @@ #include <linux/mm_types.h> #include <linux/sched.h> -extern struct page *vmemmap; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 996bf98f0cab..fe341a6578c3 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -8,7 +8,6 @@ #ifndef __ARM64_S2_PGTABLE_H_ #define __ARM64_S2_PGTABLE_H_ -#include <linux/hugetlb.h> #include <linux/pgtable.h> /* @@ -37,217 +36,12 @@ #define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) /* - * The number of PTRS across all concatenated stage2 tables given by the - * number of bits resolved at the initial level. - * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), - * in which case, stage2_pgd_ptrs will have one entry. - */ -#define pgd_ptrs_shift(ipa, pgdir_shift) \ - ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) -#define __s2_pgd_ptrs(ipa, lvls) \ - (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) -#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) - -#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) -#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) - -/* * kvm_mmmu_cache_min_pages() is the number of pages required to install * a stage-2 translation. We pre-allocate the entry level page table at * the VM creation. */ #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) -/* Stage2 PUD definitions when the level is present */ -static inline bool kvm_stage2_has_pud(struct kvm *kvm) -{ - return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); -} - -#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) -#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) -#define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) - -#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) -#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) -#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) -#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d) - -static inline p4d_t *stage2_p4d_offset(struct kvm *kvm, - pgd_t *pgd, unsigned long address) -{ - return p4d_offset(pgd, address); -} - -static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d) -{ -} - -static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp) -{ - return false; -} - -static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm, - phys_addr_t addr, phys_addr_t end) -{ - return end; -} - -static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d) -{ - if (kvm_stage2_has_pud(kvm)) - return p4d_none(p4d); - else - return 0; -} - -static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp) -{ - if (kvm_stage2_has_pud(kvm)) - p4d_clear(p4dp); -} - -static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d) -{ - if (kvm_stage2_has_pud(kvm)) - return p4d_present(p4d); - else - return 1; -} - -static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud) -{ - if (kvm_stage2_has_pud(kvm)) - p4d_populate(NULL, p4d, pud); -} - -static inline pud_t *stage2_pud_offset(struct kvm *kvm, - p4d_t *p4d, unsigned long address) -{ - if (kvm_stage2_has_pud(kvm)) - return pud_offset(p4d, address); - else - return (pud_t *)p4d; -} - -static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) -{ - if (kvm_stage2_has_pud(kvm)) - free_page((unsigned long)pud); -} - -static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) -{ - if (kvm_stage2_has_pud(kvm)) - return kvm_page_empty(pudp); - else - return false; -} - -static inline phys_addr_t -stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - if (kvm_stage2_has_pud(kvm)) { - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; - - return (boundary - 1 < end - 1) ? boundary : end; - } else { - return end; - } -} - -/* Stage2 PMD definitions when the level is present */ -static inline bool kvm_stage2_has_pmd(struct kvm *kvm) -{ - return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); -} - -#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) -#define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) - -static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_none(pud); - else - return 0; -} - -static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) -{ - if (kvm_stage2_has_pmd(kvm)) - pud_clear(pud); -} - -static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_present(pud); - else - return 1; -} - -static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) -{ - if (kvm_stage2_has_pmd(kvm)) - pud_populate(NULL, pud, pmd); -} - -static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, - pud_t *pud, unsigned long address) -{ - if (kvm_stage2_has_pmd(kvm)) - return pmd_offset(pud, address); - else - return (pmd_t *)pud; -} - -static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) -{ - if (kvm_stage2_has_pmd(kvm)) - free_page((unsigned long)pmd); -} - -static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_huge(pud); - else - return 0; -} - -static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) -{ - if (kvm_stage2_has_pmd(kvm)) - return kvm_page_empty(pmdp); - else - return 0; -} - -static inline phys_addr_t -stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - if (kvm_stage2_has_pmd(kvm)) { - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; - - return (boundary - 1 < end - 1) ? boundary : end; - } else { - return end; - } -} - -static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) -{ - return kvm_page_empty(ptep); -} - -static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) -{ - return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); -} - static inline phys_addr_t stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { @@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) return (boundary - 1 < end - 1) ? boundary : end; } -/* - * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and - * the architectural page-table level. - */ -#define S2_NO_LEVEL_HINT 0 -#define S2_PUD_LEVEL 1 -#define S2_PMD_LEVEL 2 -#define S2_PTE_LEVEL 3 - #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 3b859596840d..b3b2019f8d16 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 440 +#define __NR_compat_syscalls 441 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 2a3ad9b9accd..107f08e03b9f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -887,6 +887,8 @@ __SYSCALL(__NR_openat2, sys_openat2) __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #define __NR_faccessat2 439 __SYSCALL(__NR_faccessat2, sys_faccessat2) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 7d804fd0a682..1c17c3a24411 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -159,6 +159,21 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* + * PMU filter structure. Describe a range of events with a particular + * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. + */ +struct kvm_pmu_event_filter { + __u16 base_event; + __u16 nevents; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + __u8 action; + __u8 pad[3]; +}; + /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { struct { @@ -338,6 +353,7 @@ struct kvm_vcpu_events { #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_PMU_V3_FILTER 2 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index a6d688c10745..062b21f30f94 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -678,7 +678,7 @@ int sve_set_current_vl(unsigned long arg) vl = arg & PR_SVE_VL_LEN_MASK; flags = arg & ~vl; - if (!system_supports_sve()) + if (!system_supports_sve() || is_compat_task()) return -EINVAL; ret = sve_set_vector_length(current, vl, flags); @@ -691,7 +691,7 @@ int sve_set_current_vl(unsigned long arg) /* PR_SVE_GET_VL */ int sve_get_current_vl(void) { - if (!system_supports_sve()) + if (!system_supports_sve() || is_compat_task()) return -EINVAL; return sve_prctl_status(0); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 843ecfb16a69..61684a500914 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -61,14 +61,11 @@ __efistub__ctype = _ctype; * memory mappings. */ -#define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym; - /* Alternative callbacks for init-time patching of nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); /* Global kernel state accessed by nVHE hyp code. */ -KVM_NVHE_ALIAS(kvm_host_data); KVM_NVHE_ALIAS(kvm_vgic_global_state); /* Kernel constant needed to compute idmap addresses. */ diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c index 1e77736a4f66..adb955fd9bdd 100644 --- a/arch/arm64/kernel/pointer_auth.c +++ b/arch/arm64/kernel/pointer_auth.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> #include <linux/errno.h> #include <linux/prctl.h> #include <linux/random.h> @@ -17,6 +18,9 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) if (!system_supports_address_auth() && !system_supports_generic_auth()) return -EINVAL; + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + if (!arg) { ptrauth_keys_init_user(keys); return 0; diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 68b710f1b43f..25f3c80b5ffe 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -67,7 +67,8 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2. * - Mitigated in hardware and listed in our "safe list". * - Mitigated in software by firmware. - * - Mitigated in software by a CPU-specific dance in the kernel. + * - Mitigated in software by a CPU-specific dance in the kernel and a + * firmware call at EL2. * - Vulnerable. * * It's not unlikely for different CPUs in a big.LITTLE system to fall into @@ -204,8 +205,8 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn) __SMCCC_WORKAROUND_1_SMC_SZ; /* - * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if - * we're a guest. Skip the hyp-vectors work. + * Vinz Clortho takes the hyp_vecs start/end "keys" at + * the door when we're a guest. Skip the hyp-vectors work. */ if (!is_hyp_mode_available()) { __this_cpu_write(bp_hardening_data.fn, fn); @@ -259,6 +260,16 @@ static void qcom_link_stack_sanitisation(void) : "=&r" (tmp)); } +static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) +{ + u32 midr = read_cpuid_id(); + if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) && + ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1)) + return NULL; + + return qcom_link_stack_sanitisation; +} + static enum mitigation_state spectre_v2_enable_fw_mitigation(void) { bp_hardening_cb_t cb; @@ -284,26 +295,15 @@ static enum mitigation_state spectre_v2_enable_fw_mitigation(void) return SPECTRE_VULNERABLE; } + /* + * Prefer a CPU-specific workaround if it exists. Note that we + * still rely on firmware for the mitigation at EL2. + */ + cb = spectre_v2_get_sw_mitigation_cb() ?: cb; install_bp_hardening_cb(cb); return SPECTRE_MITIGATED; } -static enum mitigation_state spectre_v2_enable_sw_mitigation(void) -{ - u32 midr; - - if (spectre_v2_mitigations_off()) - return SPECTRE_VULNERABLE; - - midr = read_cpuid_id(); - if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) && - ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1)) - return SPECTRE_VULNERABLE; - - install_bp_hardening_cb(qcom_link_stack_sanitisation); - return SPECTRE_MITIGATED; -} - void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) { enum mitigation_state state; @@ -313,8 +313,6 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) state = spectre_v2_get_cpu_hw_mitigation_state(); if (state == SPECTRE_VULNERABLE) state = spectre_v2_enable_fw_mitigation(); - if (state == SPECTRE_VULNERABLE) - state = spectre_v2_enable_sw_mitigation(); update_mitigation_state(&spectre_v2_state, state); } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index bdcaaf091e1e..a8184cad8890 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -946,7 +946,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, do_signal(regs); if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 04021a93171c..d65f52264aba 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -24,14 +24,13 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti # routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so # preparation in build-time C")). ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ - -Bsymbolic $(call ld-option, --no-eh-frame-hdr) --build-id -n \ + -Bsymbolic $(call ld-option, --no-eh-frame-hdr) --build-id=sha1 -n \ $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 ccflags-y += -DDISABLE_BRANCH_PROFILING CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) -KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y @@ -47,7 +46,7 @@ endif GCOV_PROFILE := n obj-y += vdso.o -extra-y += vdso.lds +targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Force dependency (incbin is bad) diff --git a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh index 0664acaf61ff..8b806eacd0a6 100755 --- a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh +++ b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh @@ -8,7 +8,7 @@ # Doing this inside the Makefile will break the $(filter-out) function, # causing Kbuild to rebuild the vdso-offsets header file every time. # -# Author: Will Deacon <will.deacon@arm.com +# Author: Will Deacon <will.deacon@arm.com> # LC_ALL=C diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index d6adb4677c25..7f96a1a9f68c 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -90,9 +90,9 @@ VDSO_CFLAGS += -O2 # Some useful compiler-dependent flags from top-level Makefile VDSO_CFLAGS += $(call cc32-option,-Wdeclaration-after-statement,) VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) -VDSO_CFLAGS += $(call cc32-option,-fno-strict-overflow) +VDSO_CFLAGS += -fno-strict-overflow VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) -VDSO_CFLAGS += $(call cc32-option,-Werror=date-time) +VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) # The 32-bit compiler does not provide 128-bit integers, which are used in @@ -128,7 +128,7 @@ VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1 VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 VDSO_LDFLAGS += -nostdlib -shared -mfloat-abi=soft VDSO_LDFLAGS += -Wl,--hash-style=sysv -VDSO_LDFLAGS += -Wl,--build-id +VDSO_LDFLAGS += -Wl,--build-id=sha1 VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd) @@ -155,7 +155,7 @@ asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) obj-y += vdso.o -extra-y += vdso.lds +targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Force dependency (vdso.s includes vdso.so through incbin) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 5ca957e656ab..6d78c041fdf6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -10,6 +10,7 @@ #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> +#include <asm/hyp_image.h> #include <asm/kernel-pgtable.h> #include <asm/memory.h> #include <asm/page.h> @@ -22,12 +23,23 @@ ENTRY(_text) jiffies = jiffies_64; +#ifdef CONFIG_KVM #define HYPERVISOR_EXTABLE \ . = ALIGN(SZ_8); \ __start___kvm_ex_table = .; \ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; +#define HYPERVISOR_PERCPU_SECTION \ + . = ALIGN(PAGE_SIZE); \ + HYP_SECTION_NAME(.data..percpu) : { \ + *(HYP_SECTION_NAME(.data..percpu)) \ + } +#else /* CONFIG_KVM */ +#define HYPERVISOR_EXTABLE +#define HYPERVISOR_PERCPU_SECTION +#endif + #define HYPERVISOR_TEXT \ /* \ * Align to 4 KB so that \ @@ -196,6 +208,7 @@ SECTIONS } PERCPU_SECTION(L1_CACHE_BYTES) + HYPERVISOR_PERCPU_SECTION .rela.dyn : ALIGN(8) { *(.rela .rela*) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 99977c1972cc..1504c81fbf5d 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ $(KVM)/vfio.o $(KVM)/irqchip.o \ arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ - inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \ + inject_fault.o regmap.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ aarch32.o arch_timer.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index acf9a993dfb6..f56122eedffc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,8 +46,10 @@ __asm__(".arch_extension virt"); #endif -DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); +DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); + static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); @@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { int i; + bitmap_free(kvm->arch.pmu_filter); + kvm_vgic_destroy(kvm); for (i = 0; i < KVM_MAX_VCPUS; ++i) { @@ -286,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) static_branch_dec(&userspace_irqchip_in_use); - kvm_mmu_free_memory_caches(vcpu); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); @@ -1259,6 +1263,19 @@ long kvm_arch_vm_ioctl(struct file *filp, } } +static unsigned long nvhe_percpu_size(void) +{ + return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) - + (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start); +} + +static unsigned long nvhe_percpu_order(void) +{ + unsigned long size = nvhe_percpu_size(); + + return size ? get_order(size) : 0; +} + static int kvm_map_vectors(void) { /* @@ -1299,6 +1316,7 @@ static void cpu_init_hyp_mode(void) unsigned long hyp_stack_ptr; unsigned long vector_ptr; unsigned long tpidr_el2; + struct arm_smccc_res res; /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); @@ -1308,12 +1326,13 @@ static void cpu_init_hyp_mode(void) * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. */ - tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) - - (unsigned long)kvm_ksym_ref(&kvm_host_data)); + tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - + (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); pgd_ptr = kvm_mmu_get_httbr(); hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; - vector_ptr = (unsigned long)kvm_get_hyp_vector(); + hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr); + vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector)); /* * Call initialization code, and switch to the full blown HYP code. @@ -1322,7 +1341,9 @@ static void cpu_init_hyp_mode(void) * cpus_have_const_cap() wrapper. */ BUG_ON(!system_capabilities_finalized()); - __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), + pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res); + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); /* * Disabling SSBD on a non-VHE system requires us to enable SSBS @@ -1342,10 +1363,12 @@ static void cpu_hyp_reset(void) static void cpu_hyp_reinit(void) { - kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt); + kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); cpu_hyp_reset(); + *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector(); + if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); else @@ -1496,8 +1519,10 @@ static void teardown_hyp_mode(void) int cpu; free_hyp_pgds(); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order()); + } } /** @@ -1531,6 +1556,24 @@ static int init_hyp_mode(void) } /* + * Allocate and initialize pages for Hypervisor-mode percpu regions. + */ + for_each_possible_cpu(cpu) { + struct page *page; + void *page_addr; + + page = alloc_pages(GFP_KERNEL, nvhe_percpu_order()); + if (!page) { + err = -ENOMEM; + goto out_err; + } + + page_addr = page_address(page); + memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); + kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr; + } + + /* * Map the Hyp-code called directly from the host */ err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), @@ -1574,14 +1617,17 @@ static int init_hyp_mode(void) } } + /* + * Map Hyp percpu pages + */ for_each_possible_cpu(cpu) { - kvm_host_data_t *cpu_data; + char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; + char *percpu_end = percpu_begin + nvhe_percpu_size(); - cpu_data = per_cpu_ptr(&kvm_host_data, cpu); - err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); + err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); if (err) { - kvm_err("Cannot map host CPU state: %d\n", err); + kvm_err("Cannot map hyp percpu region\n"); goto out_err; } } diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S deleted file mode 100644 index 3c79a1124af2..000000000000 --- a/arch/arm64/kvm/hyp.S +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ - -#include <linux/linkage.h> - -#include <asm/alternative.h> -#include <asm/assembler.h> -#include <asm/cpufeature.h> - -/* - * u64 __kvm_call_hyp(void *hypfn, ...); - * - * This is not really a variadic function in the classic C-way and care must - * be taken when calling this to ensure parameters are passed in registers - * only, since the stack will change between the caller and the callee. - * - * Call the function with the first argument containing a pointer to the - * function you wish to call in Hyp mode, and subsequent arguments will be - * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the - * function pointer can be passed). The function being called must be mapped - * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are - * passed in x0. - * - * A function pointer with a value less than 0xfff has a special meaning, - * and is used to implement hyp stubs in the same way as in - * arch/arm64/kernel/hyp_stub.S. - */ -SYM_FUNC_START(__kvm_call_hyp) - hvc #0 - ret -SYM_FUNC_END(__kvm_call_hyp) diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index d898f0da5802..4a81eddabcd8 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vhe/ nvhe/ smccc_wa.o +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 76e7eaf4675e..b0afad7a99c6 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -7,7 +7,6 @@ #include <linux/linkage.h> #include <asm/alternative.h> -#include <asm/asm-offsets.h> #include <asm/assembler.h> #include <asm/fpsimdmacros.h> #include <asm/kvm.h> @@ -16,66 +15,28 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_ptrauth.h> -#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x) -#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8) - .text /* - * We treat x18 as callee-saved as the host may use it as a platform - * register (e.g. for shadow call stack). - */ -.macro save_callee_saved_regs ctxt - str x18, [\ctxt, #CPU_XREG_OFFSET(18)] - stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] - stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] - stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] - stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] - stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] - stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] -.endm - -.macro restore_callee_saved_regs ctxt - // We require \ctxt is not x18-x28 - ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)] - ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] - ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] - ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] - ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] - ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] - ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] -.endm - -.macro save_sp_el0 ctxt, tmp - mrs \tmp, sp_el0 - str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] -.endm - -.macro restore_sp_el0 ctxt, tmp - ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] - msr sp_el0, \tmp -.endm - -/* - * u64 __guest_enter(struct kvm_vcpu *vcpu, - * struct kvm_cpu_context *host_ctxt); + * u64 __guest_enter(struct kvm_vcpu *vcpu); */ SYM_FUNC_START(__guest_enter) // x0: vcpu - // x1: host context - // x2-x17: clobbered by macros + // x1-x17: clobbered by macros // x29: guest context - // Store the host regs + adr_this_cpu x1, kvm_hyp_ctxt, x2 + + // Store the hyp regs save_callee_saved_regs x1 - // Save the host's sp_el0 + // Save hyp's sp_el0 save_sp_el0 x1, x2 - // Now the host state is stored if we have a pending RAS SError it must - // affect the host. If any asynchronous exception is pending we defer - // the guest entry. The DSB isn't necessary before v8.2 as any SError - // would be fatal. + // Now the hyp state is stored if we have a pending RAS SError it must + // affect the host or hyp. If any asynchronous exception is pending we + // defer the guest entry. The DSB isn't necessary before v8.2 as any + // SError would be fatal. alternative_if ARM64_HAS_RAS_EXTN dsb nshst isb @@ -86,6 +47,8 @@ alternative_else_nop_endif ret 1: + set_loaded_vcpu x0, x1, x2 + add x29, x0, #VCPU_CONTEXT // Macro ptrauth_switch_to_guest format: @@ -116,6 +79,26 @@ alternative_else_nop_endif eret sb +SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + // If the hyp context is loaded, go straight to hyp_panic + get_loaded_vcpu x0, x1 + cbz x0, hyp_panic + + // The hyp context is saved so make sure it is restored to allow + // hyp_panic to run at hyp and, subsequently, panic to run in the host. + // This makes use of __guest_exit to avoid duplication but sets the + // return address to tail call into hyp_panic. As a side effect, the + // current state is saved to the guest context but it will only be + // accurate if the guest had been completely restored. + adr_this_cpu x0, kvm_hyp_ctxt, x1 + adr x1, hyp_panic + str x1, [x0, #CPU_XREG_OFFSET(30)] + + get_vcpu_ptr x1, x0 + SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x0: return code // x1: vcpu @@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Store the guest's sp_el0 save_sp_el0 x1, x2 - get_host_ctxt x2, x3 + adr_this_cpu x2, kvm_hyp_ctxt, x3 - // Macro ptrauth_switch_to_guest format: - // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3) + // Macro ptrauth_switch_to_hyp format: + // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3) // The below macro to save/restore keys is not implemented in C code // as it may cause Pointer Authentication key signing mismatch errors // when this feature is enabled for kernel code. - ptrauth_switch_to_host x1, x2, x3, x4, x5 + ptrauth_switch_to_hyp x1, x2, x3, x4, x5 - // Restore the hosts's sp_el0 + // Restore hyp's sp_el0 restore_sp_el0 x2, x3 - // Now restore the host regs + // Now restore the hyp regs restore_callee_saved_regs x2 + set_loaded_vcpu xzr, x1, x2 + alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error // without an unmask-SError and isb. The ESB-instruction consumed any diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 7ea277b82967..0a5b36eb54b3 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -12,7 +12,6 @@ #include <asm/cpufeature.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> -#include <asm/kvm_mmu.h> #include <asm/mmu.h> .macro save_caller_saved_regs_vect @@ -41,20 +40,6 @@ .text -.macro do_el2_call - /* - * Shuffle the parameters before calling the function - * pointed to in x0. Assumes parameters in x[1,2,3]. - */ - str lr, [sp, #-16]! - mov lr, x0 - mov x0, x1 - mov x1, x2 - mov x2, x3 - blr lr - ldr lr, [sp], #16 -.endm - el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 @@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2 ccmp x0, #ESR_ELx_EC_HVC32, #4, ne b.ne el1_trap -#ifdef __KVM_NVHE_HYPERVISOR__ - mrs x1, vttbr_el2 // If vttbr is valid, the guest - cbnz x1, el1_hvc_guest // called HVC - - /* Here, we're pretty sure the host called HVC. */ - ldp x0, x1, [sp], #16 - - /* Check for a stub HVC call */ - cmp x0, #HVC_STUB_HCALL_NR - b.hs 1f - - /* - * Compute the idmap address of __kvm_handle_stub_hvc and - * jump there. Since we use kimage_voffset, do not use the - * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead - * (by loading it from the constant pool). - * - * Preserve x0-x4, which may contain stub parameters. - */ - ldr x5, =__kvm_handle_stub_hvc - ldr_l x6, kimage_voffset - - /* x5 = __pa(x5) */ - sub x5, x5, x6 - br x5 - -1: - /* - * Perform the EL2 call - */ - kern_hyp_va x0 - do_el2_call - - eret - sb -#endif /* __KVM_NVHE_HYPERVISOR__ */ - -el1_hvc_guest: /* * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1. * The workaround has already been applied on the host, @@ -169,24 +116,7 @@ el2_error: eret sb -#ifdef __KVM_NVHE_HYPERVISOR__ -SYM_FUNC_START(__hyp_do_panic) - mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) - msr spsr_el2, lr - ldr lr, =panic - msr elr_el2, lr - eret - sb -SYM_FUNC_END(__hyp_do_panic) -#endif - -SYM_CODE_START(__hyp_panic) - get_host_ctxt x0, x1 - b hyp_panic -SYM_CODE_END(__hyp_panic) - -.macro invalid_vector label, target = __hyp_panic +.macro invalid_vector label, target = __guest_exit_panic .align 2 SYM_CODE_START(\label) b \target @@ -198,7 +128,6 @@ SYM_CODE_END(\label) invalid_vector el2t_irq_invalid invalid_vector el2t_fiq_invalid invalid_vector el2t_error_invalid - invalid_vector el2h_sync_invalid invalid_vector el2h_irq_invalid invalid_vector el2h_fiq_invalid invalid_vector el1_fiq_invalid @@ -228,10 +157,9 @@ check_preamble_length 661b, 662b .macro invalid_vect target .align 7 661: - b \target nop + stp x0, x1, [sp, #-16]! 662: - ldp x0, x1, [sp], #16 b \target check_preamble_length 661b, 662b diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 5e28ea6aa097..4ebe9f558f3a 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) return; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; guest_ctxt = &vcpu->arch.ctxt; host_dbg = &vcpu->arch.host_debug_state.regs; guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); @@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) return; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; guest_ctxt = &vcpu->arch.ctxt; host_dbg = &vcpu->arch.host_debug_state.regs; guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index d0f07e8cc3ff..313a8fa3c721 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) } } -static inline void __activate_vm(struct kvm_s2_mmu *mmu) -{ - __load_guest_stage2(mmu); -} - static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { u64 par, tmp; @@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr) ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \ } while(0) +DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); + static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *ctxt; @@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) return false; - ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + ctxt = this_cpu_ptr(&kvm_hyp_ctxt); __ptrauth_save_key(ctxt, APIA); __ptrauth_save_key(ctxt, APIB); __ptrauth_save_key(ctxt, APDA); @@ -481,14 +478,13 @@ exit: static inline void __kvm_unexpected_el2_exception(void) { + extern char __guest_exit_panic[]; unsigned long addr, fixup; - struct kvm_cpu_context *host_ctxt; struct exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); entry = hyp_symbol_addr(__start___kvm_ex_table); end = hyp_symbol_addr(__stop___kvm_ex_table); - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; while (entry < end) { addr = (unsigned long)&entry->insn + entry->insn; @@ -503,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void) return; } - hyp_panic(host_ctxt); + /* Trigger a panic after restoring the hyp context. */ + write_sysreg(__guest_exit_panic, elr_el2); } #endif /* __ARM64_KVM_HYP_SWITCH_H__ */ diff --git a/arch/mips/pnx833x/stb22x/Makefile b/arch/arm64/kvm/hyp/nvhe/.gitignore index 7c5ddf36b735..695d73d0249e 100644 --- a/arch/mips/pnx833x/stb22x/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y := board.o +hyp.lds diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index aef76487edc2..ddde15fe85f2 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -6,44 +6,50 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o +obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o -obj-y := $(patsubst %.o,%.hyp.o,$(obj-y)) -extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y)) +## +## Build rules for compiling nVHE hyp code +## Output of this folder is `kvm_nvhe.o`, a partially linked object +## file containing all nVHE hyp code and data. +## -$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE +hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y)) +obj-y := kvm_nvhe.o +extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds + +# 1) Compile all source files to `.nvhe.o` object files. The file extension +# avoids file name clashes for files shared with VHE. +$(obj)/%.nvhe.o: $(src)/%.c FORCE $(call if_changed_rule,cc_o_c) -$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE +$(obj)/%.nvhe.o: $(src)/%.S FORCE $(call if_changed_rule,as_o_S) -$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE - $(call if_changed,hypcopy) -# Disable reordering functions by GCC (enabled at -O2). -# This pass puts functions into '.text.*' sections to aid the linker -# in optimizing ELF layout. See HYPCOPY comment below for more info. -ccflags-y += $(call cc-option,-fno-reorder-functions) +# 2) Compile linker script. +$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE + $(call if_changed_dep,cpp_lds_S) + +# 3) Partially link all '.nvhe.o' files and apply the linker script. +# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'. +# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before +# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to +# keep the dependency on the target while avoiding an error from +# GNU ld if the linker script is passed to it twice. +LDFLAGS_kvm_nvhe.tmp.o := -r -T +$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE + $(call if_changed,ld) + +# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. +# Prefixes names of ELF symbols with '__kvm_nvhe_'. +$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE + $(call if_changed,hypcopy) # The HYPCOPY command uses `objcopy` to prefix all ELF symbol names -# and relevant ELF section names to avoid clashes with VHE code/data. -# -# Hyp code is assumed to be in the '.text' section of the input object -# files (with the exception of specialized sections such as -# '.hyp.idmap.text'). This assumption may be broken by a compiler that -# divides code into sections like '.text.unlikely' so as to optimize -# ELF layout. HYPCOPY checks that no such sections exist in the input -# using `objdump`, otherwise they would be linked together with other -# kernel code and not memory-mapped correctly at runtime. +# to avoid clashes with VHE code/data. quiet_cmd_hypcopy = HYPCOPY $@ - cmd_hypcopy = \ - if $(OBJDUMP) -h $< | grep -F '.text.'; then \ - echo "$@: function reordering not supported in nVHE hyp code" >&2; \ - /bin/false; \ - fi; \ - $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \ - --rename-section=.text=.hyp.text \ - $< $@ + cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ # Remove ftrace and Shadow Call Stack CFLAGS. # This is equivalent to the 'notrace' and '__noscs' annotations. diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S new file mode 100644 index 000000000000..ff9a0f547b9f --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull <ascull@google.com> + */ + +#include <linux/linkage.h> + +#include <asm/assembler.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + + .text + +SYM_FUNC_START(__host_exit) + stp x0, x1, [sp, #-16]! + + get_host_ctxt x0, x1 + + ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + + /* Store the host regs x2 and x3 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(2)] + + /* Retrieve the host regs x0-x1 from the stack */ + ldp x2, x3, [sp], #16 // x0, x1 + + /* Store the host regs x0-x1 and x4-x17 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(0)] + stp x4, x5, [x0, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x0, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x0, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x0, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x0, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x0, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x0, #CPU_XREG_OFFSET(16)] + + /* Store the host regs x18-x29, lr */ + save_callee_saved_regs x0 + + /* Save the host context pointer in x29 across the function call */ + mov x29, x0 + bl handle_trap + + /* Restore host regs x0-x17 */ + ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)] + + /* x0-7 are use for panic arguments */ +__host_enter_for_panic: + ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)] + + /* Restore host regs x18-x29, lr */ + restore_callee_saved_regs x29 + + /* Do not touch any register after this! */ +__host_enter_without_restoring: + eret + sb +SYM_FUNC_END(__host_exit) + +/* + * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); + */ +SYM_FUNC_START(__hyp_do_panic) + /* Load the format arguments into x1-7 */ + mov x6, x3 + get_vcpu_ptr x7, x3 + + mrs x3, esr_el2 + mrs x4, far_el2 + mrs x5, hpfar_el2 + + /* Prepare and exit to the host's panic funciton. */ + mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ + PSR_MODE_EL1h) + msr spsr_el2, lr + ldr lr, =panic + msr elr_el2, lr + + /* + * Set the panic format string and enter the host, conditionally + * restoring the host context. + */ + cmp x0, xzr + ldr x0, =__hyp_panic_string + b.eq __host_enter_without_restoring + b __host_enter_for_panic +SYM_FUNC_END(__hyp_do_panic) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + stp x0, x1, [sp, #-16]! + mrs x0, esr_el2 + lsr x0, x0, #ESR_ELx_EC_SHIFT + cmp x0, #ESR_ELx_EC_HVC64 + ldp x0, x1, [sp], #16 + b.ne __host_exit + + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.hs __host_exit + + /* + * Compute the idmap address of __kvm_handle_stub_hvc and + * jump there. Since we use kimage_voffset, do not use the + * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead + * (by loading it from the constant pool). + * + * Preserve x0-x4, which may contain stub parameters. + */ + ldr x5, =__kvm_handle_stub_hvc + ldr_l x6, kimage_voffset + + /* x5 = __pa(x5) */ + sub x5, x5, x6 + br x5 +.L__vect_end\@: +.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) + .error "host_el1_sync_vect larger than vector entry" +.endif +.endm + +.macro invalid_host_el2_vect + .align 7 + /* If a guest is loaded, panic out of it. */ + stp x0, x1, [sp, #-16]! + get_loaded_vcpu x0, x1 + cbnz x0, __guest_exit_panic + add sp, sp, #16 + + /* + * The panic may not be clean if the exception is taken before the host + * context has been saved by __host_exit or after the hyp context has + * been partially clobbered by __host_enter. + */ + b hyp_panic +.endm + +.macro invalid_host_el1_vect + .align 7 + mov x0, xzr /* restore_host = false */ + mrs x1, spsr_el2 + mrs x2, elr_el2 + mrs x3, par_el1 + b __hyp_do_panic +.endm + +/* + * The host vector does not use an ESB instruction in order to avoid consuming + * SErrors that should only be consumed by the host. Guest entry is deferred by + * __guest_enter if there are any pending asynchronous exceptions so hyp will + * always return to the host without having consumerd host SErrors. + * + * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the + * host knows about the EL2 vectors already, and there is no point in hiding + * them. + */ + .align 11 +SYM_CODE_START(__kvm_hyp_host_vector) + invalid_host_el2_vect // Synchronous EL2t + invalid_host_el2_vect // IRQ EL2t + invalid_host_el2_vect // FIQ EL2t + invalid_host_el2_vect // Error EL2t + + invalid_host_el2_vect // Synchronous EL2h + invalid_host_el2_vect // IRQ EL2h + invalid_host_el2_vect // FIQ EL2h + invalid_host_el2_vect // Error EL2h + + host_el1_sync_vect // Synchronous 64-bit EL1 + invalid_host_el1_vect // IRQ 64-bit EL1 + invalid_host_el1_vect // FIQ 64-bit EL1 + invalid_host_el1_vect // Error 64-bit EL1 + + invalid_host_el1_vect // Synchronous 32-bit EL1 + invalid_host_el1_vect // IRQ 32-bit EL1 + invalid_host_el1_vect // FIQ 32-bit EL1 + invalid_host_el1_vect // Error 32-bit EL1 +SYM_CODE_END(__kvm_hyp_host_vector) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index d9434e90c06d..47224dc62c51 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -4,11 +4,13 @@ * Author: Marc Zyngier <marc.zyngier@arm.com> */ +#include <linux/arm-smccc.h> #include <linux/linkage.h> #include <asm/alternative.h> #include <asm/assembler.h> #include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> #include <asm/pgtable-hwdef.h> #include <asm/sysreg.h> @@ -44,27 +46,37 @@ __invalid: b . /* - * x0: HYP pgd - * x1: HYP stack - * x2: HYP vectors - * x3: per-CPU offset + * x0: SMCCC function ID + * x1: HYP pgd + * x2: per-CPU offset + * x3: HYP stack + * x4: HYP vectors */ __do_hyp_init: /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc - phys_to_ttbr x4, x0 + /* Set tpidr_el2 for use by HYP to free a register */ + msr tpidr_el2, x2 + + mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) + cmp x0, x2 + b.eq 1f + mov x0, #SMCCC_RET_NOT_SUPPORTED + eret + +1: phys_to_ttbr x0, x1 alternative_if ARM64_HAS_CNP - orr x4, x4, #TTBR_CNP_BIT + orr x0, x0, #TTBR_CNP_BIT alternative_else_nop_endif - msr ttbr0_el2, x4 + msr ttbr0_el2, x0 - mrs x4, tcr_el1 - mov_q x5, TCR_EL2_MASK - and x4, x4, x5 - mov x5, #TCR_EL2_RES1 - orr x4, x4, x5 + mrs x0, tcr_el1 + mov_q x1, TCR_EL2_MASK + and x0, x0, x1 + mov x1, #TCR_EL2_RES1 + orr x0, x0, x1 /* * The ID map may be configured to use an extended virtual address @@ -80,18 +92,18 @@ alternative_else_nop_endif * * So use the same T0SZ value we use for the ID map. */ - ldr_l x5, idmap_t0sz - bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + ldr_l x1, idmap_t0sz + bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH /* * Set the PS bits in TCR_EL2. */ - tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6 + tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 - msr tcr_el2, x4 + msr tcr_el2, x0 - mrs x4, mair_el1 - msr mair_el2, x4 + mrs x0, mair_el1 + msr mair_el2, x0 isb /* Invalidate the stale TLBs from Bootloader */ @@ -103,25 +115,22 @@ alternative_else_nop_endif * as well as the EE bit on BE. Drop the A flag since the compiler * is allowed to generate unaligned accesses. */ - mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) -CPU_BE( orr x4, x4, #SCTLR_ELx_EE) + mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) +CPU_BE( orr x0, x0, #SCTLR_ELx_EE) alternative_if ARM64_HAS_ADDRESS_AUTH - mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ + mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) - orr x4, x4, x5 + orr x0, x0, x1 alternative_else_nop_endif - msr sctlr_el2, x4 + msr sctlr_el2, x0 isb /* Set the stack and new vectors */ - kern_hyp_va x1 - mov sp, x1 - msr vbar_el2, x2 - - /* Set tpidr_el2 for use by HYP */ - msr tpidr_el2, x3 + mov sp, x3 + msr vbar_el2, x4 /* Hello, World! */ + mov x0, #SMCCC_RET_SUCCESS eret SYM_CODE_END(__kvm_hyp_init) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c new file mode 100644 index 000000000000..e2eafe2c93af --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull <ascull@google.com> + */ + +#include <hyp/switch.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +#include <kvm/arm_hypercalls.h> + +static void handle_host_hcall(unsigned long func_id, + struct kvm_cpu_context *host_ctxt) +{ + unsigned long ret = 0; + + switch (func_id) { + case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1; + + ret = __kvm_vcpu_run(kern_hyp_va(vcpu)); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context): + __kvm_flush_vm_context(); + break; + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + phys_addr_t ipa = host_ctxt->regs.regs[2]; + int level = host_ctxt->regs.regs[3]; + + __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + + __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + + __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): { + u64 cntvoff = host_ctxt->regs.regs[1]; + + __kvm_timer_set_cntvoff(cntvoff); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs): + __kvm_enable_ssbs(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2): + ret = __vgic_v3_get_ich_vtr_el2(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr): + ret = __vgic_v3_read_vmcr(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): { + u32 vmcr = host_ctxt->regs.regs[1]; + + __vgic_v3_write_vmcr(vmcr); + break; + } + case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs): + __vgic_v3_init_lrs(); + break; + case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2): + ret = __kvm_get_mdcr_el2(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; + + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); + break; + } + case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; + + __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); + break; + } + default: + /* Invalid host HVC. */ + host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED; + return; + } + + host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS; + host_ctxt->regs.regs[1] = ret; +} + +void handle_trap(struct kvm_cpu_context *host_ctxt) +{ + u64 esr = read_sysreg_el2(SYS_ESR); + unsigned long func_id; + + if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64) + hyp_panic(); + + func_id = host_ctxt->regs.regs[0]; + handle_host_hcall(func_id, host_ctxt); +} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S new file mode 100644 index 000000000000..bb2d986ff696 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + * Written by David Brazdil <dbrazdil@google.com> + * + * Linker script used for partial linking of nVHE EL2 object files. + */ + +#include <asm/hyp_image.h> +#include <asm-generic/vmlinux.lds.h> +#include <asm/cache.h> +#include <asm/memory.h> + +SECTIONS { + HYP_SECTION(.text) + HYP_SECTION_NAME(.data..percpu) : { + PERCPU_INPUT(L1_CACHE_BYTES) + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8d3dd4f47924..a457a0306e03 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -27,6 +27,11 @@ #include <asm/processor.h> #include <asm/thread_info.h> +/* Non-VHE specific context */ +DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -42,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) } write_sysreg(val, cptr_el2); + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt; @@ -60,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { + extern char __kvm_hyp_host_vector[]; u64 mdcr_el2; ___deactivate_traps(vcpu); @@ -91,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(mdcr_el2, mdcr_el2); write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); + write_sysreg(__kvm_hyp_host_vector, vbar_el2); } -static void __deactivate_vm(struct kvm_vcpu *vcpu) +static void __load_host_stage2(void) { write_sysreg(0, vttbr_el2); } @@ -173,9 +181,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmr_sync(); } - vcpu = kern_hyp_va(vcpu); - - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; @@ -194,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_state_nvhe(guest_ctxt); - __activate_vm(kern_hyp_va(vcpu->arch.hw_mmu)); + __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); @@ -204,7 +210,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) do { /* Jump in the fire! */ - exit_code = __guest_enter(vcpu, host_ctxt); + exit_code = __guest_enter(vcpu); /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); @@ -215,7 +221,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __hyp_vgic_save_state(vcpu); __deactivate_traps(vcpu); - __deactivate_vm(vcpu); + __load_host_stage2(); __sysreg_restore_state_nvhe(host_ctxt); @@ -235,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQOFF); + host_ctxt->__hyp_running_vcpu = NULL; + return exit_code; } -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) +void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); - struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu; - unsigned long str_va; + bool restore_host = true; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; - if (read_sysreg(vttbr_el2)) { + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (vcpu) { __timer_disable_traps(vcpu); __deactivate_traps(vcpu); - __deactivate_vm(vcpu); + __load_host_stage2(); __sysreg_restore_state_nvhe(host_ctxt); } - /* - * Force the panic string to be loaded from the literal pool, - * making sure it is a kernel address and not a PC-relative - * reference. - */ - asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string)); - - __hyp_do_panic(str_va, - spsr, elr, - read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR), - read_sysreg(hpfar_el2), par, vcpu); + __hyp_do_panic(restore_host, spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index b15d65a42042..39ca71ab8866 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -61,7 +61,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - mmu = kern_hyp_va(mmu); __tlb_switch_to_guest(mmu, &cxt); /* @@ -115,7 +114,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - mmu = kern_hyp_va(mmu); __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalls12e1is); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c new file mode 100644 index 000000000000..0cdf6e461cbd --- /dev/null +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -0,0 +1,892 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. + * No bombay mix was harmed in the writing of this file. + * + * Copyright (C) 2020 Google LLC + * Author: Will Deacon <will@kernel.org> + */ + +#include <linux/bitfield.h> +#include <asm/kvm_pgtable.h> + +#define KVM_PGTABLE_MAX_LEVELS 4U + +#define KVM_PTE_VALID BIT(0) + +#define KVM_PTE_TYPE BIT(1) +#define KVM_PTE_TYPE_BLOCK 0 +#define KVM_PTE_TYPE_PAGE 1 +#define KVM_PTE_TYPE_TABLE 1 + +#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) + +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + +struct kvm_pgtable_walk_data { + struct kvm_pgtable *pgt; + struct kvm_pgtable_walker *walker; + + u64 addr; + u64 end; +}; + +static u64 kvm_granule_shift(u32 level) +{ + /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); +} + +static u64 kvm_granule_size(u32 level) +{ + return BIT(kvm_granule_shift(level)); +} + +static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) +{ + u64 granule = kvm_granule_size(level); + + /* + * Reject invalid block mappings and don't bother with 4TB mappings for + * 52-bit PAs. + */ + if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1)) + return false; + + if (granule > (end - addr)) + return false; + + return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule); +} + +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) +{ + u64 shift = kvm_granule_shift(level); + u64 mask = BIT(PAGE_SHIFT - 3) - 1; + + return (data->addr >> shift) & mask; +} + +static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) +{ + u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ + u64 mask = BIT(pgt->ia_bits) - 1; + + return (addr & mask) >> shift; +} + +static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data) +{ + return __kvm_pgd_page_idx(data->pgt, data->addr); +} + +static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) +{ + struct kvm_pgtable pgt = { + .ia_bits = ia_bits, + .start_level = start_level, + }; + + return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; +} + +static bool kvm_pte_valid(kvm_pte_t pte) +{ + return pte & KVM_PTE_VALID; +} + +static bool kvm_pte_table(kvm_pte_t pte, u32 level) +{ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return false; + + if (!kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; +} + +static u64 kvm_pte_to_phys(kvm_pte_t pte) +{ + u64 pa = pte & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + + return pa; +} + +static kvm_pte_t kvm_phys_to_pte(u64 pa) +{ + kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + + return pte; +} + +static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte) +{ + return __va(kvm_pte_to_phys(pte)); +} + +static void kvm_set_invalid_pte(kvm_pte_t *ptep) +{ + kvm_pte_t pte = *ptep; + WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID); +} + +static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp) +{ + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp)); + + pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); + pte |= KVM_PTE_VALID; + + WARN_ON(kvm_pte_valid(old)); + smp_store_release(ptep, pte); +} + +static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr, + u32 level) +{ + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa); + u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; + + pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); + pte |= FIELD_PREP(KVM_PTE_TYPE, type); + pte |= KVM_PTE_VALID; + + /* Tolerate KVM recreating the exact same mapping. */ + if (kvm_pte_valid(old)) + return old == pte; + + smp_store_release(ptep, pte); + return true; +} + +static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, + u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag) +{ + struct kvm_pgtable_walker *walker = data->walker; + return walker->cb(addr, data->end, level, ptep, flag, walker->arg); +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + kvm_pte_t *pgtable, u32 level); + +static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, + kvm_pte_t *ptep, u32 level) +{ + int ret = 0; + u64 addr = data->addr; + kvm_pte_t *childp, pte = *ptep; + bool table = kvm_pte_table(pte, level); + enum kvm_pgtable_walk_flags flags = data->walker->flags; + + if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_TABLE_PRE); + } + + if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_LEAF); + pte = *ptep; + table = kvm_pte_table(pte, level); + } + + if (ret) + goto out; + + if (!table) { + data->addr += kvm_granule_size(level); + goto out; + } + + childp = kvm_pte_follow(pte); + ret = __kvm_pgtable_walk(data, childp, level + 1); + if (ret) + goto out; + + if (flags & KVM_PGTABLE_WALK_TABLE_POST) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_TABLE_POST); + } + +out: + return ret; +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + kvm_pte_t *pgtable, u32 level) +{ + u32 idx; + int ret = 0; + + if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) + return -EINVAL; + + for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { + kvm_pte_t *ptep = &pgtable[idx]; + + if (data->addr >= data->end) + break; + + ret = __kvm_pgtable_visit(data, ptep, level); + if (ret) + break; + } + + return ret; +} + +static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) +{ + u32 idx; + int ret = 0; + struct kvm_pgtable *pgt = data->pgt; + u64 limit = BIT(pgt->ia_bits); + + if (data->addr > limit || data->end > limit) + return -ERANGE; + + if (!pgt->pgd) + return -EINVAL; + + for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { + kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; + + ret = __kvm_pgtable_walk(data, ptep, pgt->start_level); + if (ret) + break; + } + + return ret; +} + +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_pgtable_walker *walker) +{ + struct kvm_pgtable_walk_data walk_data = { + .pgt = pgt, + .addr = ALIGN_DOWN(addr, PAGE_SIZE), + .end = PAGE_ALIGN(walk_data.addr + size), + .walker = walker, + }; + + return _kvm_pgtable_walk(&walk_data); +} + +struct hyp_map_data { + u64 phys; + kvm_pte_t attr; +}; + +static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, + struct hyp_map_data *data) +{ + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; + kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); + u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; + u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : + KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; + + if (!(prot & KVM_PGTABLE_PROT_R)) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_X) { + if (prot & KVM_PGTABLE_PROT_W) + return -EINVAL; + + if (device) + return -EINVAL; + } else { + attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; + } + + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + data->attr = attr; + return 0; +} + +static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, struct hyp_map_data *data) +{ + u64 granule = kvm_granule_size(level), phys = data->phys; + + if (!kvm_block_mapping_supported(addr, end, phys, level)) + return false; + + WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)); + data->phys += granule; + return true; +} + +static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t *childp; + + if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg)) + return 0; + + if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + if (!childp) + return -ENOMEM; + + kvm_set_table_pte(ptep, childp); + return 0; +} + +int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot) +{ + int ret; + struct hyp_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + }; + struct kvm_pgtable_walker walker = { + .cb = hyp_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &map_data, + }; + + ret = hyp_map_set_prot_attr(prot, &map_data); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + isb(); + return ret; +} + +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) +{ + u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); + + pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = va_bits; + pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->mmu = NULL; + return 0; +} + +static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + free_page((unsigned long)kvm_pte_follow(*ptep)); + return 0; +} + +void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) +{ + struct kvm_pgtable_walker walker = { + .cb = hyp_free_walker, + .flags = KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + free_page((unsigned long)pgt->pgd); + pgt->pgd = NULL; +} + +struct stage2_map_data { + u64 phys; + kvm_pte_t attr; + + kvm_pte_t *anchor; + + struct kvm_s2_mmu *mmu; + struct kvm_mmu_memory_cache *memcache; +}; + +static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, + struct stage2_map_data *data) +{ + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) : + PAGE_S2_MEMATTR(NORMAL); + u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + + if (!(prot & KVM_PGTABLE_PROT_X)) + attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + else if (device) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_R) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + data->attr = attr; + return 0; +} + +static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + u64 granule = kvm_granule_size(level), phys = data->phys; + + if (!kvm_block_mapping_supported(addr, end, phys, level)) + return false; + + if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)) + goto out; + + /* There's an existing valid leaf entry, so perform break-before-make */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + kvm_set_valid_leaf_pte(ptep, phys, data->attr, level); +out: + data->phys += granule; + return true; +} + +static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + if (data->anchor) + return 0; + + if (!kvm_block_mapping_supported(addr, end, data->phys, level)) + return 0; + + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0); + data->anchor = ptep; + return 0; +} + +static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + kvm_pte_t *childp, pte = *ptep; + struct page *page = virt_to_page(ptep); + + if (data->anchor) { + if (kvm_pte_valid(pte)) + put_page(page); + + return 0; + } + + if (stage2_map_walker_try_leaf(addr, end, level, ptep, data)) + goto out_get_page; + + if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + if (!data->memcache) + return -ENOMEM; + + childp = kvm_mmu_memory_cache_alloc(data->memcache); + if (!childp) + return -ENOMEM; + + /* + * If we've run into an existing block mapping then replace it with + * a table. Accesses beyond 'end' that fall within the new table + * will be mapped lazily. + */ + if (kvm_pte_valid(pte)) { + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + put_page(page); + } + + kvm_set_table_pte(ptep, childp); + +out_get_page: + get_page(page); + return 0; +} + +static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + int ret = 0; + + if (!data->anchor) + return 0; + + free_page((unsigned long)kvm_pte_follow(*ptep)); + put_page(virt_to_page(ptep)); + + if (data->anchor == ptep) { + data->anchor = NULL; + ret = stage2_map_walk_leaf(addr, end, level, ptep, data); + } + + return ret; +} + +/* + * This is a little fiddly, as we use all three of the walk flags. The idea + * is that the TABLE_PRE callback runs for table entries on the way down, + * looking for table entries which we could conceivably replace with a + * block entry for this mapping. If it finds one, then it sets the 'anchor' + * field in 'struct stage2_map_data' to point at the table entry, before + * clearing the entry to zero and descending into the now detached table. + * + * The behaviour of the LEAF callback then depends on whether or not the + * anchor has been set. If not, then we're not using a block mapping higher + * up the table and we perform the mapping at the existing leaves instead. + * If, on the other hand, the anchor _is_ set, then we drop references to + * all valid leaves so that the pages beneath the anchor can be freed. + * + * Finally, the TABLE_POST callback does nothing if the anchor has not + * been set, but otherwise frees the page-table pages while walking back up + * the page-table, installing the block entry when it revisits the anchor + * pointer and clearing the anchor to NULL. + */ +static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + struct stage2_map_data *data = arg; + + switch (flag) { + case KVM_PGTABLE_WALK_TABLE_PRE: + return stage2_map_walk_table_pre(addr, end, level, ptep, data); + case KVM_PGTABLE_WALK_LEAF: + return stage2_map_walk_leaf(addr, end, level, ptep, data); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_map_walk_table_post(addr, end, level, ptep, data); + } + + return -EINVAL; +} + +int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *mc) +{ + int ret; + struct stage2_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + .mmu = pgt->mmu, + .memcache = mc, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_TABLE_PRE | + KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + .arg = &map_data, + }; + + ret = stage2_map_set_prot_attr(prot, &map_data); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; +} + +static void stage2_flush_dcache(void *addr, u64 size) +{ + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return; + + __flush_dcache_area(addr, size); +} + +static bool stage2_pte_cacheable(kvm_pte_t pte) +{ + u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte); + return memattr == PAGE_S2_MEMATTR(NORMAL); +} + +static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct kvm_s2_mmu *mmu = arg; + kvm_pte_t pte = *ptep, *childp = NULL; + bool need_flush = false; + + if (!kvm_pte_valid(pte)) + return 0; + + if (kvm_pte_table(pte, level)) { + childp = kvm_pte_follow(pte); + + if (page_count(virt_to_page(childp)) != 1) + return 0; + } else if (stage2_pte_cacheable(pte)) { + need_flush = true; + } + + /* + * This is similar to the map() path in that we unmap the entire + * block entry and rely on the remaining portions being faulted + * back lazily. + */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); + put_page(virt_to_page(ptep)); + + if (need_flush) { + stage2_flush_dcache(kvm_pte_follow(pte), + kvm_granule_size(level)); + } + + if (childp) + free_page((unsigned long)childp); + + return 0; +} + +int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_unmap_walker, + .arg = pgt->mmu, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +struct stage2_attr_data { + kvm_pte_t attr_set; + kvm_pte_t attr_clr; + kvm_pte_t pte; + u32 level; +}; + +static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + struct stage2_attr_data *data = arg; + + if (!kvm_pte_valid(pte)) + return 0; + + data->level = level; + data->pte = pte; + pte &= ~data->attr_clr; + pte |= data->attr_set; + + /* + * We may race with the CPU trying to set the access flag here, + * but worst-case the access flag update gets lost and will be + * set on the next access instead. + */ + if (data->pte != pte) + WRITE_ONCE(*ptep, pte); + + return 0; +} + +static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, + u64 size, kvm_pte_t attr_set, + kvm_pte_t attr_clr, kvm_pte_t *orig_pte, + u32 *level) +{ + int ret; + kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; + struct stage2_attr_data data = { + .attr_set = attr_set & attr_mask, + .attr_clr = attr_clr & attr_mask, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_attr_walker, + .arg = &data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + if (ret) + return ret; + + if (orig_pte) + *orig_pte = data.pte; + + if (level) + *level = data.level; + return 0; +} + +int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + return stage2_update_leaf_attrs(pgt, addr, size, 0, + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + NULL, NULL); +} + +kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, + &pte, NULL); + dsb(ishst); + return pte; +} + +kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF, + &pte, NULL); + /* + * "But where's the TLBI?!", you scream. + * "Over in the core code", I sigh. + * + * See the '->clear_flush_young()' callback on the KVM mmu notifier. + */ + return pte; +} + +bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL); + return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; +} + +int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot) +{ + int ret; + u32 level; + kvm_pte_t set = 0, clr = 0; + + if (prot & KVM_PGTABLE_PROT_R) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + if (prot & KVM_PGTABLE_PROT_X) + clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level); + if (!ret) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); + return ret; +} + +static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte)) + return 0; + + stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level)); + return 0; +} + +int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_flush_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return 0; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) +{ + size_t pgd_sz; + u64 vtcr = kvm->arch.vtcr; + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; + pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = ia_bits; + pgt->start_level = start_level; + pgt->mmu = &kvm->arch.mmu; + + /* Ensure zeroed PGD pages are visible to the hardware walker */ + dsb(ishst); + return 0; +} + +static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte)) + return 0; + + put_page(virt_to_page(ptep)); + + if (kvm_pte_table(pte, level)) + free_page((unsigned long)kvm_pte_follow(pte)); + + return 0; +} + +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + struct kvm_pgtable_walker walker = { + .cb = stage2_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; + free_pages_exact(pgt->pgd, pgd_sz); + pgt->pgd = NULL; +} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index ecf67e678203..fe69de16dadc 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -28,6 +28,11 @@ const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; +/* VHE specific context */ +DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); +DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -59,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) write_sysreg(val, cpacr_el1); - write_sysreg(kvm_get_hyp_vector(), vbar_el1); + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } NOKPROBE_SYMBOL(__activate_traps); @@ -108,7 +113,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt; u64 exit_code; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; @@ -120,12 +125,12 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * HCR_EL2.TGE. * * We have already configured the guest's stage 1 translation in - * kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm - * before __activate_traps, because __activate_vm configures - * stage 2 translation, and __activate_traps clear HCR_EL2.TGE - * (among other things). + * kvm_vcpu_load_sysregs_vhe above. We must now call + * __load_guest_stage2 before __activate_traps, because + * __load_guest_stage2 configures stage 2 translation, and + * __activate_traps clear HCR_EL2.TGE (among other things). */ - __activate_vm(vcpu->arch.hw_mmu); + __load_guest_stage2(vcpu->arch.hw_mmu); __activate_traps(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); @@ -133,7 +138,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) do { /* Jump in the fire! */ - exit_code = __guest_enter(vcpu, host_ctxt); + exit_code = __guest_enter(vcpu); /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); @@ -188,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return ret; } -static void __hyp_call_panic(u64 spsr, u64 elr, u64 par, - struct kvm_cpu_context *host_ctxt) +static void __hyp_call_panic(u64 spsr, u64 elr, u64 par) { + struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; vcpu = host_ctxt->__hyp_running_vcpu; __deactivate_traps(vcpu); @@ -204,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par, } NOKPROBE_SYMBOL(__hyp_call_panic); -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) +void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); - __hyp_call_panic(spsr, elr, par, host_ctxt); + __hyp_call_panic(spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 996471e4c138..2a0b8c88d74f 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; __sysreg_save_user_state(host_ctxt); /* @@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; deactivate_traps_vhe_put(); __sysreg_save_el1_state(guest_ctxt); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index ebfdfc27b2bd..34a96ab244fa 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) /** * kvm_inject_undefined - inject an undefined instruction into the guest + * @vcpu: The vCPU in which to inject the exception * * It is assumed that this code is called from the VCPU thread and that the * VCPU therefore is not currently executing guest code. diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3d26b47a1343..19aacc7d64de 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -14,6 +14,7 @@ #include <asm/cacheflush.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> #include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> @@ -21,9 +22,7 @@ #include "trace.h" -static pgd_t *boot_hyp_pgd; -static pgd_t *hyp_pgd; -static pgd_t *merged_hyp_pgd; +static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); static unsigned long hyp_idmap_start; @@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; -#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) -#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) -#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) - -static bool is_iomap(unsigned long flags) +/* + * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, + * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, + * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too + * long will also starve other vCPUs. We have to also make sure that the page + * tables are not freed while we released the lock. + */ +static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end, + int (*fn)(struct kvm_pgtable *, u64, u64), + bool resched) { - return flags & KVM_S2PTE_FLAG_IS_IOMAP; + int ret; + u64 next; + + do { + struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + if (!pgt) + return -EINVAL; + + next = stage2_pgd_addr_end(kvm, addr, end); + ret = fn(pgt, addr, next - addr); + if (ret) + break; + + if (resched && next != end) + cond_resched_lock(&kvm->mmu_lock); + } while (addr = next, addr != end); + + return ret; } +#define stage2_apply_range_resched(kvm, addr, end, fn) \ + stage2_apply_range(kvm, addr, end, fn, true) + static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); @@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); } -static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, - int level) -{ - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level); -} - -/* - * D-Cache management functions. They take the page table entries by - * value, as they are flushing the cache using the kernel mapping (or - * kmap on 32bit). - */ -static void kvm_flush_dcache_pte(pte_t pte) -{ - __kvm_flush_dcache_pte(pte); -} - -static void kvm_flush_dcache_pmd(pmd_t pmd) -{ - __kvm_flush_dcache_pmd(pmd); -} - -static void kvm_flush_dcache_pud(pud_t pud) -{ - __kvm_flush_dcache_pud(pud); -} - static bool kvm_is_device_pfn(unsigned long pfn) { return !pfn_valid(pfn); } -/** - * stage2_dissolve_pmd() - clear and flush huge PMD entry - * @mmu: pointer to mmu structure to operate on - * @addr: IPA - * @pmd: pmd pointer for IPA - * - * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. - */ -static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd) -{ - if (!pmd_thp_or_huge(*pmd)) - return; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - put_page(virt_to_page(pmd)); -} - -/** - * stage2_dissolve_pud() - clear and flush huge PUD entry - * @mmu: pointer to mmu structure to operate on - * @addr: IPA - * @pud: pud pointer for IPA - * - * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. - */ -static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp) -{ - struct kvm *kvm = mmu->kvm; - - if (!stage2_pud_huge(kvm, *pudp)) - return; - - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - put_page(virt_to_page(pudp)); -} - -static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); - stage2_pgd_clear(kvm, pgd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_p4d_free(kvm, p4d_table); - put_page(virt_to_page(pgd)); -} - -static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0); - stage2_p4d_clear(kvm, p4d); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_pud_free(kvm, pud_table); - put_page(virt_to_page(p4d)); -} - -static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); - - VM_BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_pmd_free(kvm, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - free_page((unsigned long)pte_table); - put_page(virt_to_page(pmd)); -} - -static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) -{ - WRITE_ONCE(*ptep, new_pte); - dsb(ishst); -} - -static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd) -{ - WRITE_ONCE(*pmdp, new_pmd); - dsb(ishst); -} - -static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) -{ - kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); -} - -static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) -{ - WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); - dsb(ishst); -} - -static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp) -{ - WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp)); - dsb(ishst); -} - -static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) -{ -#ifndef __PAGETABLE_P4D_FOLDED - WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp)); - dsb(ishst); -#endif -} - /* * Unmapping vs dcache management: * @@ -223,120 +105,19 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) * end up writing old data to disk. * * This is why right after unmapping a page/section and invalidating - * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure - * the IO subsystem will never hit in the cache. + * the corresponding TLBs, we flush to make sure the IO subsystem will + * never hit in the cache. * * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as * we then fully enforce cacheability of RAM, no matter what the guest * does. */ -static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t start_addr = addr; - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - pte_t old_pte = *pte; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL); - - /* No need to invalidate the cache for device mappings */ - if (!kvm_is_device_pfn(pte_pfn(old_pte))) - kvm_flush_dcache_pte(old_pte); - - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (stage2_pte_table_empty(mmu->kvm, start_pte)) - clear_stage2_pmd_entry(mmu, pmd, start_addr); -} - -static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - pmd_t old_pmd = *pmd; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - - kvm_flush_dcache_pmd(old_pmd); - - put_page(virt_to_page(pmd)); - } else { - unmap_stage2_ptes(mmu, pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); - - if (stage2_pmd_table_empty(kvm, start_pmd)) - clear_stage2_pud_entry(mmu, pud, start_addr); -} - -static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - pud_t *pud, *start_pud; - - start_pud = pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - pud_t old_pud = *pud; - - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); - } else { - unmap_stage2_pmds(mmu, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); - - if (stage2_pud_table_empty(kvm, start_pud)) - clear_stage2_p4d_entry(mmu, p4d, start_addr); -} - -static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - p4d_t *p4d, *start_p4d; - - start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - unmap_stage2_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); - - if (stage2_p4d_table_empty(kvm, start_p4d)) - clear_stage2_pgd_entry(mmu, pgd, start_addr); -} - /** * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer + * @mmu: The KVM stage-2 MMU pointer * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap + * @may_block: Whether or not we are permitted to block * * Clear a range of stage-2 mappings, lowering the various ref-counts. Must * be called while holding mmu_lock (unless for freeing the stage2 pgd before @@ -347,32 +128,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 bool may_block) { struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; + phys_addr_t end = start + size; assert_spin_locked(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Make sure the page table is still active, as another thread - * could have possibly freed the page table, while we released - * the lock. - */ - if (!READ_ONCE(mmu->pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_p4ds(mmu, pgd, addr, next); - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. - */ - if (may_block && next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); + WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap, + may_block)); } static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) @@ -380,89 +141,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si __unmap_stage2_range(mmu, start, size, true); } -static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) - kvm_flush_dcache_pte(*pte); - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) - kvm_flush_dcache_pmd(*pmd); - else - stage2_flush_ptes(mmu, pmd, addr, next); - } - } while (pmd++, addr = next, addr != end); -} - -static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) - kvm_flush_dcache_pud(*pud); - else - stage2_flush_pmds(mmu, pud, addr, next); - } - } while (pud++, addr = next, addr != end); -} - -static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - phys_addr_t next; - - p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - stage2_flush_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { - struct kvm_s2_mmu *mmu = &kvm->arch.mmu; phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - phys_addr_t next; - pgd_t *pgd; - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_p4ds(mmu, pgd, addr, next); - - if (next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); + stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush); } /** @@ -489,338 +174,28 @@ static void stage2_flush_vm(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } -static void clear_hyp_pgd_entry(pgd_t *pgd) -{ - p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL); - pgd_clear(pgd); - p4d_free(NULL, p4d_table); - put_page(virt_to_page(pgd)); -} - -static void clear_hyp_p4d_entry(p4d_t *p4d) -{ - pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL); - VM_BUG_ON(p4d_huge(*p4d)); - p4d_clear(p4d); - pud_free(NULL, pud_table); - put_page(virt_to_page(p4d)); -} - -static void clear_hyp_pud_entry(pud_t *pud) -{ - pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); - pmd_free(NULL, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_hyp_pmd_entry(pmd_t *pmd) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - pte_free_kernel(NULL, pte_table); - put_page(virt_to_page(pmd)); -} - -static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (hyp_pte_table_empty(start_pte)) - clear_hyp_pmd_entry(pmd); -} - -static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - /* Hyp doesn't use huge pmds */ - if (!pmd_none(*pmd)) - unmap_hyp_ptes(pmd, addr, next); - } while (pmd++, addr = next, addr != end); - - if (hyp_pmd_table_empty(start_pmd)) - clear_hyp_pud_entry(pud); -} - -static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pud_t *pud, *start_pud; - - start_pud = pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - /* Hyp doesn't use huge puds */ - if (!pud_none(*pud)) - unmap_hyp_pmds(pud, addr, next); - } while (pud++, addr = next, addr != end); - - if (hyp_pud_table_empty(start_pud)) - clear_hyp_p4d_entry(p4d); -} - -static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - p4d_t *p4d, *start_p4d; - - start_p4d = p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - /* Hyp doesn't use huge p4ds */ - if (!p4d_none(*p4d)) - unmap_hyp_puds(p4d, addr, next); - } while (p4d++, addr = next, addr != end); - - if (hyp_p4d_table_empty(start_p4d)) - clear_hyp_pgd_entry(pgd); -} - -static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) -{ - return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); -} - -static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, - phys_addr_t start, u64 size) -{ - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; - - /* - * We don't unmap anything from HYP, except at the hyp tear down. - * Hence, we don't have to invalidate the TLBs here. - */ - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - do { - next = pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_hyp_p4ds(pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); -} - -static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); -} - /** * free_hyp_pgds - free Hyp-mode page tables - * - * Assumes hyp_pgd is a page table used strictly in Hyp-mode and - * therefore contains either mappings in the kernel memory area (above - * PAGE_OFFSET), or device mappings in the idmap range. - * - * boot_hyp_pgd should only map the idmap range, and is only used in - * the extended idmap case. */ void free_hyp_pgds(void) { - pgd_t *id_pgd; - mutex_lock(&kvm_hyp_pgd_mutex); - - id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd; - - if (id_pgd) { - /* In case we never called hyp_mmu_init() */ - if (!io_map_base) - io_map_base = hyp_idmap_start; - unmap_hyp_idmap_range(id_pgd, io_map_base, - hyp_idmap_start + PAGE_SIZE - io_map_base); + if (hyp_pgtable) { + kvm_pgtable_hyp_destroy(hyp_pgtable); + kfree(hyp_pgtable); } - - if (boot_hyp_pgd) { - free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); - boot_hyp_pgd = NULL; - } - - if (hyp_pgd) { - unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), - (uintptr_t)high_memory - PAGE_OFFSET); - - free_pages((unsigned long)hyp_pgd, hyp_pgd_order); - hyp_pgd = NULL; - } - if (merged_hyp_pgd) { - clear_page(merged_hyp_pgd); - free_page((unsigned long)merged_hyp_pgd); - merged_hyp_pgd = NULL; - } - mutex_unlock(&kvm_hyp_pgd_mutex); } -static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pte_t *pte; - unsigned long addr; - - addr = start; - do { - pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); - get_page(virt_to_page(pte)); - pfn++; - } while (addr += PAGE_SIZE, addr != end); -} - -static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pmd_t *pmd; - pte_t *pte; - unsigned long addr, next; - - addr = start; - do { - pmd = pmd_offset(pud, addr); - - BUG_ON(pmd_sect(*pmd)); - - if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL); - if (!pte) { - kvm_err("Cannot allocate Hyp pte\n"); - return -ENOMEM; - } - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - next = pmd_addr_end(addr, end); - - create_hyp_pte_mappings(pmd, addr, next, pfn, prot); - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pud_t *pud; - pmd_t *pmd; - unsigned long addr, next; - int ret; - - addr = start; - do { - pud = pud_offset(p4d, addr); - - if (pud_none_or_clear_bad(pud)) { - pmd = pmd_alloc_one(NULL, addr); - if (!pmd) { - kvm_err("Cannot allocate Hyp pmd\n"); - return -ENOMEM; - } - kvm_pud_populate(pud, pmd); - get_page(virt_to_page(pud)); - } - - next = pud_addr_end(addr, end); - ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) +static int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { - p4d_t *p4d; - pud_t *pud; - unsigned long addr, next; - int ret; - - addr = start; - do { - p4d = p4d_offset(pgd, addr); - - if (p4d_none(*p4d)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); - return -ENOMEM; - } - kvm_p4d_populate(p4d, pud); - get_page(virt_to_page(p4d)); - } - - next = p4d_addr_end(addr, end); - ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, - unsigned long start, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pgd_t *pgd; - p4d_t *p4d; - unsigned long addr, next; - int err = 0; + int err; mutex_lock(&kvm_hyp_pgd_mutex); - addr = start & PAGE_MASK; - end = PAGE_ALIGN(end); - do { - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - - if (pgd_none(*pgd)) { - p4d = p4d_alloc_one(NULL, addr); - if (!p4d) { - kvm_err("Cannot allocate Hyp p4d\n"); - err = -ENOMEM; - goto out; - } - kvm_pgd_populate(pgd, p4d); - get_page(virt_to_page(pgd)); - } - - next = pgd_addr_end(addr, end); - err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot); - if (err) - goto out; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); -out: + err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); mutex_unlock(&kvm_hyp_pgd_mutex); + return err; } @@ -845,7 +220,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying * physical pages. */ -int create_hyp_mappings(void *from, void *to, pgprot_t prot) +int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) { phys_addr_t phys_addr; unsigned long virt_addr; @@ -862,9 +237,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) int err; phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); - err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, - virt_addr, virt_addr + PAGE_SIZE, - __phys_to_pfn(phys_addr), + err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, prot); if (err) return err; @@ -874,9 +247,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) } static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, - unsigned long *haddr, pgprot_t prot) + unsigned long *haddr, + enum kvm_pgtable_prot prot) { - pgd_t *pgd = hyp_pgd; unsigned long base; int ret = 0; @@ -908,17 +281,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, if (ret) goto out; - if (__kvm_cpu_uses_extended_idmap()) - pgd = boot_hyp_pgd; - - ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - base, base + size, - __phys_to_pfn(phys_addr), prot); + ret = __create_hyp_mappings(base, size, phys_addr, prot); if (ret) goto out; *haddr = base + offset_in_page(phys_addr); - out: return ret; } @@ -989,47 +356,48 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure * - * Allocates only the stage-2 HW PGD level table(s) of size defined by - * stage2_pgd_size(mmu->kvm). - * + * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. */ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) { - phys_addr_t pgd_phys; - pgd_t *pgd; - int cpu; + int cpu, err; + struct kvm_pgtable *pgt; - if (mmu->pgd != NULL) { + if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); - if (!pgd) + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + if (!pgt) return -ENOMEM; - pgd_phys = virt_to_phys(pgd); - if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) - return -EINVAL; + err = kvm_pgtable_stage2_init(pgt, kvm); + if (err) + goto out_free_pgtable; mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { - free_pages_exact(pgd, stage2_pgd_size(kvm)); - return -ENOMEM; + err = -ENOMEM; + goto out_destroy_pgtable; } for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; mmu->kvm = kvm; - mmu->pgd = pgd; - mmu->pgd_phys = pgd_phys; + mmu->pgt = pgt; + mmu->pgd_phys = __pa(pgt->pgd); mmu->vmid.vmid_gen = 0; - return 0; + +out_destroy_pgtable: + kvm_pgtable_stage2_destroy(pgt); +out_free_pgtable: + kfree(pgt); + return err; } static void stage2_unmap_memslot(struct kvm *kvm, @@ -1102,363 +470,21 @@ void stage2_unmap_vm(struct kvm *kvm) void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) { struct kvm *kvm = mmu->kvm; - void *pgd = NULL; + struct kvm_pgtable *pgt = NULL; spin_lock(&kvm->mmu_lock); - if (mmu->pgd) { - unmap_stage2_range(mmu, 0, kvm_phys_size(kvm)); - pgd = READ_ONCE(mmu->pgd); - mmu->pgd = NULL; - } - spin_unlock(&kvm->mmu_lock); - - /* Free the HW pgd, one page at a time */ - if (pgd) { - free_pages_exact(pgd, stage2_pgd_size(kvm)); + pgt = mmu->pgt; + if (pgt) { + mmu->pgd_phys = 0; + mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } -} - -static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - p4d_t *p4d; - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - if (stage2_pgd_none(kvm, *pgd)) { - if (!cache) - return NULL; - p4d = kvm_mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, p4d); - get_page(virt_to_page(pgd)); - } - - return stage2_p4d_offset(kvm, pgd, addr); -} - -static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - pud_t *pud; - - p4d = stage2_get_p4d(mmu, cache, addr); - if (stage2_p4d_none(kvm, *p4d)) { - if (!cache) - return NULL; - pud = kvm_mmu_memory_cache_alloc(cache); - stage2_p4d_populate(kvm, p4d, pud); - get_page(virt_to_page(p4d)); - } - - return stage2_pud_offset(kvm, p4d, addr); -} - -static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - pmd_t *pmd; - - pud = stage2_get_pud(mmu, cache, addr); - if (!pud || stage2_pud_huge(kvm, *pud)) - return NULL; - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return NULL; - pmd = kvm_mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - return stage2_pmd_offset(kvm, pud, addr); -} - -static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pmd_t *new_pmd) -{ - pmd_t *pmd, old_pmd; - -retry: - pmd = stage2_get_pmd(mmu, cache, addr); - VM_BUG_ON(!pmd); - - old_pmd = *pmd; - /* - * Multiple vcpus faulting on the same PMD entry, can - * lead to them sequentially updating the PMD with the - * same value. Following the break-before-make - * (pmd_clear() followed by tlb_flush()) process can - * hinder forward progress due to refaults generated - * on missing translations. - * - * Skip updating the page table if the entry is - * unchanged. - */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - - if (pmd_present(old_pmd)) { - /* - * If we already have PTE level mapping for this block, - * we must unmap it to avoid inconsistent TLB state and - * leaking the table page. We could end up in this situation - * if the memory slot was marked for dirty logging and was - * reverted, leaving PTE level mappings for the pages accessed - * during the period. So, unmap the PTE level mapping for this - * block and retry, as we could have released the upper level - * table in the process. - * - * Normal THP split/merge follows mmu_notifier callbacks and do - * get handled accordingly. - */ - if (!pmd_thp_or_huge(old_pmd)) { - unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE); - goto retry; - } - /* - * Mapping in huge pages should only happen through a - * fault. If a page is merged into a transparent huge - * page, the individual subpages of that huge page - * should be unmapped through MMU notifiers before we - * get here. - * - * Merging of CompoundPages is not supported; they - * should become splitting first, unmapped, merged, - * and mapped back in on-demand. - */ - WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - } else { - get_page(virt_to_page(pmd)); - } - - kvm_set_pmd(pmd, *new_pmd); - return 0; -} - -static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pud_t *new_pudp) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pudp, old_pud; - -retry: - pudp = stage2_get_pud(mmu, cache, addr); - VM_BUG_ON(!pudp); - - old_pud = *pudp; - - /* - * A large number of vcpus faulting on the same stage 2 entry, - * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). - * Skip updating the page tables if there is no change. - */ - if (pud_val(old_pud) == pud_val(*new_pudp)) - return 0; - - if (stage2_pud_present(kvm, old_pud)) { - /* - * If we already have table level mapping for this block, unmap - * the range for this block and retry. - */ - if (!stage2_pud_huge(kvm, old_pud)) { - unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE); - goto retry; - } - - WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - } else { - get_page(virt_to_page(pudp)); - } - - kvm_set_pud(pudp, *new_pudp); - return 0; -} - -/* - * stage2_get_leaf_entry - walk the stage2 VM page tables and return - * true if a valid and present leaf-entry is found. A pointer to the - * leaf-entry is returned in the appropriate level variable - pudpp, - * pmdpp, ptepp. - */ -static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr, - pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - *pudpp = NULL; - *pmdpp = NULL; - *ptepp = NULL; - - pudp = stage2_get_pud(mmu, NULL, addr); - if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) - return false; - - if (stage2_pud_huge(kvm, *pudp)) { - *pudpp = pudp; - return true; - } - - pmdp = stage2_pmd_offset(kvm, pudp, addr); - if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) - return false; - - if (pmd_thp_or_huge(*pmdp)) { - *pmdpp = pmdp; - return true; - } - - ptep = pte_offset_kernel(pmdp, addr); - if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) - return false; - - *ptepp = ptep; - return true; -} - -static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz) -{ - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - bool found; - - found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep); - if (!found) - return false; - - if (pudp) - return sz <= PUD_SIZE && kvm_s2pud_exec(pudp); - else if (pmdp) - return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp); - else - return sz == PAGE_SIZE && kvm_s2pte_exec(ptep); -} - -static int stage2_set_pte(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, - unsigned long flags) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, old_pte; - bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; - bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; - - VM_BUG_ON(logging_active && !cache); - - /* Create stage-2 page table mapping - Levels 0 and 1 */ - pud = stage2_get_pud(mmu, cache, addr); - if (!pud) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PUD, then continue - * on to allocate page. - */ - if (logging_active) - stage2_dissolve_pud(mmu, addr, pud); - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pmd = kvm_mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - pmd = stage2_pmd_offset(kvm, pud, addr); - if (!pmd) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PMD, then continue on to - * allocate page. - */ - if (logging_active) - stage2_dissolve_pmd(mmu, addr, pmd); - - /* Create stage-2 page mappings - Level 2 */ - if (pmd_none(*pmd)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pte = kvm_mmu_memory_cache_alloc(cache); - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - pte = pte_offset_kernel(pmd, addr); - - if (iomap && pte_present(*pte)) - return -EFAULT; - - /* Create 2nd stage page table mapping - Level 3 */ - old_pte = *pte; - if (pte_present(old_pte)) { - /* Skip page table update if there is no change */ - if (pte_val(old_pte) == pte_val(*new_pte)) - return 0; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL); - } else { - get_page(virt_to_page(pte)); - } - - kvm_set_pte(pte, *new_pte); - return 0; -} + spin_unlock(&kvm->mmu_lock); -#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); - return 1; + if (pgt) { + kvm_pgtable_stage2_destroy(pgt); + kfree(pgt); } - return 0; -} -#else -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - return __ptep_test_and_clear_young(pte); -} -#endif - -static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pmd); -} - -static int stage2_pudp_test_and_clear_young(pud_t *pud) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pud); } /** @@ -1468,169 +494,52 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud) * @guest_ipa: The IPA at which to insert the mapping * @pa: The physical address of the device * @size: The size of the mapping + * @writable: Whether or not to create a writable mapping */ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable) { - phys_addr_t addr, end; + phys_addr_t addr; int ret = 0; - unsigned long pfn; struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, }; + struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_R | + (writable ? KVM_PGTABLE_PROT_W : 0); - end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; - pfn = __phys_to_pfn(pa); - - for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); - - if (writable) - pte = kvm_s2pte_mkwrite(pte); + size += offset_in_page(guest_ipa); + guest_ipa &= PAGE_MASK; + for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { ret = kvm_mmu_topup_memory_cache(&cache, kvm_mmu_cache_min_pages(kvm)); if (ret) - goto out; + break; + spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte, - KVM_S2PTE_FLAG_IS_IOMAP); + ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, + &cache); spin_unlock(&kvm->mmu_lock); if (ret) - goto out; + break; - pfn++; + pa += PAGE_SIZE; } -out: kvm_mmu_free_memory_cache(&cache); return ret; } /** - * stage2_wp_ptes - write protect PMD range - * @pmd: pointer to pmd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - if (!kvm_s2pte_readonly(pte)) - kvm_set_s2pte_readonly(pte); - } - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -/** - * stage2_wp_pmds - write protect PUD range - * kvm: kvm instance for the VM - * @pud: pointer to pud entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - if (!kvm_s2pmd_readonly(pmd)) - kvm_set_s2pmd_readonly(pmd); - } else { - stage2_wp_ptes(pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); -} - -/** - * stage2_wp_puds - write protect P4D range - * @p4d: pointer to p4d entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - if (!kvm_s2pud_readonly(pud)) - kvm_set_s2pud_readonly(pud); - } else { - stage2_wp_pmds(mmu, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); -} - -/** - * stage2_wp_p4ds - write protect PGD range - * @pgd: pointer to pgd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - phys_addr_t next; - - p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - stage2_wp_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -/** * stage2_wp_range() - write protect stage2 memory region range - * @kvm: The KVM pointer + * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - phys_addr_t next; - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Release kvm_mmu_lock periodically if the memory region is - * large. Otherwise, we may see kernel panics with - * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, - * CONFIG_LOCKDEP. Additionally, holding the lock too long - * will also starve other vCPUs. We have to also make sure - * that the page tables are not freed while we released - * the lock. - */ - cond_resched_lock(&kvm->mmu_lock); - if (!READ_ONCE(mmu->pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_p4ds(mmu, pgd, addr, next); - } while (pgd++, addr = next, addr != end); + stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect); } /** @@ -1833,20 +742,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { - int ret; + int ret = 0; bool write_fault, writable, force_pte = false; - bool exec_fault, needs_exec; + bool exec_fault; + bool device = false; unsigned long mmu_seq; - gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + gfn_t gfn; kvm_pfn_t pfn; - pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); - unsigned long vma_pagesize, flags = 0; - struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + unsigned long vma_pagesize; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); @@ -1871,31 +781,41 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, else vma_shift = PAGE_SHIFT; - vma_pagesize = 1ULL << vma_shift; if (logging_active || - (vma->vm_flags & VM_PFNMAP) || - !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { + (vma->vm_flags & VM_PFNMAP)) { force_pte = true; - vma_pagesize = PAGE_SIZE; vma_shift = PAGE_SHIFT; } - /* - * The stage2 has a minimum of 2 level table (For arm64 see - * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can - * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). - * As for PUD huge maps, we must make sure that we have at least - * 3 levels, i.e, PMD is not folded. - */ - if (vma_pagesize == PMD_SIZE || - (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) - gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; + if (vma_shift == PUD_SHIFT && + !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) + vma_shift = PMD_SHIFT; + + if (vma_shift == PMD_SHIFT && + !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + force_pte = true; + vma_shift = PAGE_SHIFT; + } + + vma_pagesize = 1UL << vma_shift; + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + fault_ipa &= ~(vma_pagesize - 1); + + gfn = fault_ipa >> PAGE_SHIFT; mmap_read_unlock(current->mm); - /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; + /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + if (fault_status != FSC_PERM || (logging_active && write_fault)) { + ret = kvm_mmu_topup_memory_cache(memcache, + kvm_mmu_cache_min_pages(kvm)); + if (ret) + return ret; + } mmu_seq = vcpu->kvm->mmu_notifier_seq; /* @@ -1918,28 +838,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; if (kvm_is_device_pfn(pfn)) { - mem_type = PAGE_S2_DEVICE; - flags |= KVM_S2PTE_FLAG_IS_IOMAP; - } else if (logging_active) { - /* - * Faults on pages in a memslot with logging enabled - * should not be mapped with huge pages (it introduces churn - * and performance degradation), so force a pte mapping. - */ - flags |= KVM_S2_FLAG_LOGGING_ACTIVE; - + device = true; + } else if (logging_active && !write_fault) { /* * Only actually map the page as writable if this was a write * fault. */ - if (!write_fault) - writable = false; + writable = false; } - if (exec_fault && is_iomap(flags)) + if (exec_fault && device) return -ENOEXEC; spin_lock(&kvm->mmu_lock); + pgt = vcpu->arch.hw_mmu->pgt; if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; @@ -1950,67 +862,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PAGE_SIZE && !force_pte) vma_pagesize = transparent_hugepage_adjust(memslot, hva, &pfn, &fault_ipa); - if (writable) + if (writable) { + prot |= KVM_PGTABLE_PROT_W; kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); + } - if (fault_status != FSC_PERM && !is_iomap(flags)) + if (fault_status != FSC_PERM && !device) clean_dcache_guest_page(pfn, vma_pagesize); - if (exec_fault) + if (exec_fault) { + prot |= KVM_PGTABLE_PROT_X; invalidate_icache_guest_page(pfn, vma_pagesize); + } - /* - * If we took an execution fault we have made the - * icache/dcache coherent above and should now let the s2 - * mapping be executable. - * - * Write faults (!exec_fault && FSC_PERM) are orthogonal to - * execute permissions, and we preserve whatever we have. - */ - needs_exec = exec_fault || - (fault_status == FSC_PERM && - stage2_is_exec(mmu, fault_ipa, vma_pagesize)); - - /* - * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and - * all we have is a 2-level page table. Trying to map a PUD in - * this case would be fatally wrong. - */ - if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) { - pud_t new_pud = kvm_pfn_pud(pfn, mem_type); - - new_pud = kvm_pud_mkhuge(new_pud); - if (writable) - new_pud = kvm_s2pud_mkwrite(new_pud); - - if (needs_exec) - new_pud = kvm_s2pud_mkexec(new_pud); - - ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud); - } else if (vma_pagesize == PMD_SIZE) { - pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); - - new_pmd = kvm_pmd_mkhuge(new_pmd); - - if (writable) - new_pmd = kvm_s2pmd_mkwrite(new_pmd); - - if (needs_exec) - new_pmd = kvm_s2pmd_mkexec(new_pmd); + if (device) + prot |= KVM_PGTABLE_PROT_DEVICE; + else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) + prot |= KVM_PGTABLE_PROT_X; - ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd); + if (fault_status == FSC_PERM && !(logging_active && writable)) { + ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); } else { - pte_t new_pte = kvm_pfn_pte(pfn, mem_type); - - if (writable) { - new_pte = kvm_s2pte_mkwrite(new_pte); - mark_page_dirty(kvm, gfn); - } - - if (needs_exec) - new_pte = kvm_s2pte_mkexec(new_pte); - - ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags); + ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + __pfn_to_phys(pfn), prot, + memcache); } out_unlock: @@ -2020,46 +896,23 @@ out_unlock: return ret; } -/* - * Resolve the access fault by making the page young again. - * Note that because the faulting entry is guaranteed not to be - * cached in the TLB, we don't need to invalidate anything. - * Only the HW Access Flag updates are supported for Stage 2 (no DBM), - * so there is no need for atomic (pte|pmd)_mkyoung operations. - */ +/* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - kvm_pfn_t pfn; - bool pfn_valid = false; + pte_t pte; + kvm_pte_t kpte; + struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); spin_lock(&vcpu->kvm->mmu_lock); - - if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte)) - goto out; - - if (pud) { /* HugeTLB */ - *pud = kvm_s2pud_mkyoung(*pud); - pfn = kvm_pud_pfn(*pud); - pfn_valid = true; - } else if (pmd) { /* THP, HugeTLB */ - *pmd = pmd_mkyoung(*pmd); - pfn = pmd_pfn(*pmd); - pfn_valid = true; - } else { - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; - } - -out: + mmu = vcpu->arch.hw_mmu; + kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); spin_unlock(&vcpu->kvm->mmu_lock); - if (pfn_valid) - kvm_set_pfn_accessed(pfn); + + pte = __pte(kpte); + if (pte_valid(pte)) + kvm_set_pfn_accessed(pte_pfn(pte)); } /** @@ -2230,7 +1083,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_unmap_hva_range(start, end); @@ -2240,28 +1093,27 @@ int kvm_unmap_hva_range(struct kvm *kvm, static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pte_t *pte = (pte_t *)data; + kvm_pfn_t *pfn = (kvm_pfn_t *)data; WARN_ON(size != PAGE_SIZE); + /* - * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE - * flag clear because MMU notifiers will have unmapped a huge PMD before - * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and - * therefore stage2_set_pte() never needs to clear out a huge PMD - * through this calling path. + * The MMU notifiers will have unmapped a huge PMD before calling + * ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore we never need to clear out a huge PMD through this + * calling path and a memcache is not required. */ - stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0); + kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE, + __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL); return 0; } - int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; kvm_pfn_t pfn = pte_pfn(pte); - pte_t stage2_pte; - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_set_spte_hva(hva); @@ -2271,51 +1123,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); - + handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn); return 0; } static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pte_t pte; + kvm_pte_t kpte; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return stage2_pudp_test_and_clear_young(pud); - else if (pmd) - return stage2_pmdp_test_and_clear_young(pmd); - else - return stage2_ptep_test_and_clear_young(pte); + kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa); + pte = __pte(kpte); + return pte_valid(pte) && pte_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return kvm_s2pud_young(*pud); - else if (pmd) - return pmd_young(*pmd); - else - return pte_young(*pte); + return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa); } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_age_hva(start, end); return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); @@ -2323,24 +1154,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_test_age_hva(hva); return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, kvm_test_age_hva_handler, NULL); } -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); -} - phys_addr_t kvm_mmu_get_httbr(void) { - if (__kvm_cpu_uses_extended_idmap()) - return virt_to_phys(merged_hyp_pgd); - else - return virt_to_phys(hyp_pgd); + return __pa(hyp_pgtable->pgd); } phys_addr_t kvm_get_idmap_vector(void) @@ -2348,15 +1171,11 @@ phys_addr_t kvm_get_idmap_vector(void) return hyp_idmap_vector; } -static int kvm_map_idmap_text(pgd_t *pgd) +static int kvm_map_idmap_text(void) { - int err; - - /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - hyp_idmap_start, hyp_idmap_end, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); + unsigned long size = hyp_idmap_end - hyp_idmap_start; + int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, + PAGE_HYP_EXEC); if (err) kvm_err("Failed to idmap %lx-%lx\n", hyp_idmap_start, hyp_idmap_end); @@ -2367,6 +1186,7 @@ static int kvm_map_idmap_text(pgd_t *pgd) int kvm_mmu_init(void) { int err; + u32 hyp_va_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -2380,6 +1200,8 @@ int kvm_mmu_init(void) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); + hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", kern_hyp_va(PAGE_OFFSET), @@ -2397,43 +1219,30 @@ int kvm_mmu_init(void) goto out; } - hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); - if (!hyp_pgd) { - kvm_err("Hyp mode PGD not allocated\n"); + hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); + if (!hyp_pgtable) { + kvm_err("Hyp mode page-table not allocated\n"); err = -ENOMEM; goto out; } - if (__kvm_cpu_uses_extended_idmap()) { - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - hyp_pgd_order); - if (!boot_hyp_pgd) { - kvm_err("Hyp boot PGD not allocated\n"); - err = -ENOMEM; - goto out; - } - - err = kvm_map_idmap_text(boot_hyp_pgd); - if (err) - goto out; + err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits); + if (err) + goto out_free_pgtable; - merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - if (!merged_hyp_pgd) { - kvm_err("Failed to allocate extra HYP pgd\n"); - goto out; - } - __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, - hyp_idmap_start); - } else { - err = kvm_map_idmap_text(hyp_pgd); - if (err) - goto out; - } + err = kvm_map_idmap_text(); + if (err) + goto out_destroy_pgtable; io_map_base = hyp_idmap_start; return 0; + +out_destroy_pgtable: + kvm_pgtable_hyp_destroy(hyp_pgtable); +out_free_pgtable: + kfree(hyp_pgtable); + hyp_pgtable = NULL; out: - free_hyp_pgds(); return err; } @@ -2537,7 +1346,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, spin_lock(&kvm->mmu_lock); if (ret) unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size); - else + else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) stage2_flush_memslot(kvm, memslot); spin_unlock(&kvm->mmu_lock); out: diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 81916e360b1e..2ed5ef8f274b 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); #define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 +static u32 kvm_pmu_event_mask(struct kvm *kvm) +{ + switch (kvm->arch.pmuver) { + case 1: /* ARMv8.0 */ + return GENMASK(9, 0); + case 4: /* ARMv8.1 */ + case 5: /* ARMv8.4 */ + case 6: /* ARMv8.5 */ + return GENMASK(15, 0); + default: /* Shouldn't be here, just for sanity */ + WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver); + return 0; + } +} + /** * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter * @vcpu: The vcpu pointer @@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) return false; reg = PMEVTYPER0_EL0 + select_idx; - eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT; + eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm); return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; } @@ -516,7 +531,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) /* PMSWINC only applies to ... SW_INC! */ type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); - type &= ARMV8_PMU_EVTYPE_EVENT; + type &= kvm_pmu_event_mask(vcpu->kvm); if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) continue; @@ -599,11 +614,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) data = __vcpu_sys_reg(vcpu, reg); kvm_pmu_stop_counter(vcpu, pmc); - eventsel = data & ARMV8_PMU_EVTYPE_EVENT; + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) + eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + else + eventsel = data & kvm_pmu_event_mask(vcpu->kvm); + + /* Software increment event doesn't need to be backed by a perf event */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR) + return; - /* Software increment event does't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && - pmc->idx != ARMV8_PMU_CYCLE_IDX) + /* + * If we have a filter in place and that the event isn't allowed, do + * not install a perf event either. + */ + if (vcpu->kvm->arch.pmu_filter && + !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; memset(&attr, 0, sizeof(struct perf_event_attr)); @@ -615,8 +640,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ - attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? - ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; + attr.config = eventsel; counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); @@ -700,17 +724,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { - u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK; + u64 reg, mask; + + mask = ARMV8_PMU_EVTYPE_MASK; + mask &= ~ARMV8_PMU_EVTYPE_EVENT; + mask |= kvm_pmu_event_mask(vcpu->kvm); reg = (select_idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; - __vcpu_sys_reg(vcpu, reg) = event_type; + __vcpu_sys_reg(vcpu, reg) = data & mask; kvm_pmu_update_pmc_chained(vcpu, select_idx); kvm_pmu_create_perf_event(vcpu, select_idx); } +static int kvm_pmu_probe_pmuver(void) +{ + struct perf_event_attr attr = { }; + struct perf_event *event; + struct arm_pmu *pmu; + int pmuver = 0xf; + + /* + * Create a dummy event that only counts user cycles. As we'll never + * leave this function with the event being live, it will never + * count anything. But it allows us to probe some of the PMU + * details. Yes, this is terrible. + */ + attr.type = PERF_TYPE_RAW; + attr.size = sizeof(attr); + attr.pinned = 1; + attr.disabled = 0; + attr.exclude_user = 0; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.exclude_host = 1; + attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + attr.sample_period = GENMASK(63, 0); + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, &attr); + + if (IS_ERR(event)) { + pr_err_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return 0xf; + } + + if (event->pmu) { + pmu = to_arm_pmu(event->pmu); + if (pmu->pmuver) + pmuver = pmu->pmuver; + } + + perf_event_disable(event); + perf_event_release_kernel(event); + + return pmuver; +} + +u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) +{ + unsigned long *bmap = vcpu->kvm->arch.pmu_filter; + u64 val, mask = 0; + int base, i; + + if (!pmceid1) { + val = read_sysreg(pmceid0_el0); + base = 0; + } else { + val = read_sysreg(pmceid1_el0); + base = 32; + } + + if (!bmap) + return val; + + for (i = 0; i < 32; i += 8) { + u64 byte; + + byte = bitmap_get_value8(bmap, base + i); + mask |= byte << i; + byte = bitmap_get_value8(bmap, 0x4000 + base + i); + mask |= byte << (32 + i); + } + + return val & mask; +} + bool kvm_arm_support_pmu_v3(void) { /* @@ -756,15 +858,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) { - if (!kvm_arm_support_pmu_v3()) - return -ENODEV; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENXIO; - - if (vcpu->arch.pmu.created) - return -EBUSY; - if (irqchip_in_kernel(vcpu->kvm)) { int ret; @@ -820,6 +913,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { + if (!kvm_arm_support_pmu_v3() || + !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENODEV; + + if (vcpu->arch.pmu.created) + return -EBUSY; + + if (!vcpu->kvm->arch.pmuver) + vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); + + if (vcpu->kvm->arch.pmuver == 0xf) + return -ENODEV; + switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; @@ -828,9 +934,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - if (get_user(irq, uaddr)) return -EFAULT; @@ -848,6 +951,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) vcpu->arch.pmu.irq_num = irq; return 0; } + case KVM_ARM_VCPU_PMU_V3_FILTER: { + struct kvm_pmu_event_filter __user *uaddr; + struct kvm_pmu_event_filter filter; + int nr_events; + + nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; + + uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (((u32)filter.base_event + filter.nevents) > nr_events || + (filter.action != KVM_PMU_EVENT_ALLOW && + filter.action != KVM_PMU_EVENT_DENY)) + return -EINVAL; + + mutex_lock(&vcpu->kvm->lock); + + if (!vcpu->kvm->arch.pmu_filter) { + vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); + if (!vcpu->kvm->arch.pmu_filter) { + mutex_unlock(&vcpu->kvm->lock); + return -ENOMEM; + } + + /* + * The default depends on the first applied filter. + * If it allows events, the default is to deny. + * Conversely, if the first filter denies a set of + * events, the default is to allow. + */ + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events); + else + bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events); + } + + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + else + bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + + mutex_unlock(&vcpu->kvm->lock); + + return 0; + } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } @@ -884,6 +1034,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: + case KVM_ARM_VCPU_PMU_V3_FILTER: if (kvm_arm_support_pmu_v3() && test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) return 0; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 3c224162b3dd..faf32a44ba04 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -31,9 +31,9 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) */ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { - struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); - if (!kvm_pmu_switch_needed(attr)) + if (!ctx || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -47,7 +47,10 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) */ void kvm_clr_pmu_events(u32 clr) { - struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); + + if (!ctx) + return; ctx->pmu_events.events_host &= ~clr; ctx->pmu_events.events_guest &= ~clr; @@ -173,7 +176,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) return; preempt_disable(); - host = this_cpu_ptr(&kvm_host_data); + host = this_cpu_ptr_hyp_sym(kvm_host_data); events_guest = host->pmu_events.events_guest; events_host = host->pmu_events.events_host; @@ -193,7 +196,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) if (!has_vhe()) return; - host = this_cpu_ptr(&kvm_host_data); + host = this_cpu_ptr_hyp_sym(kvm_host_data); events_guest = host->pmu_events.events_guest; events_host = host->pmu_events.events_host; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f6e8b4a75cbb..f32490229a4c 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -335,7 +335,7 @@ u32 get_kvm_ipa_limit(void) int kvm_set_ipa_limit(void) { - unsigned int ipa_max, pa_max, va_max, parange, tgran_2; + unsigned int parange, tgran_2; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); @@ -372,39 +372,11 @@ int kvm_set_ipa_limit(void) break; } - pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); - - /* Clamp the IPA limit to the PA size supported by the kernel */ - ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; - /* - * Since our stage2 table is dependent on the stage1 page table code, - * we must always honor the following condition: - * - * Number of levels in Stage1 >= Number of levels in Stage2. - * - * So clamp the ipa limit further down to limit the number of levels. - * Since we can concatenate upto 16 tables at entry level, we could - * go upto 4bits above the maximum VA addressable with the current - * number of levels. - */ - va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; - va_max += 4; - - if (va_max < ipa_max) - ipa_max = va_max; - - /* - * If the final limit is lower than the real physical address - * limit of the CPUs, report the reason. - */ - if (ipa_max < pa_max) - pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", - (va_max < pa_max) ? "Virtual" : "Physical"); - - WARN(ipa_max < KVM_PHYS_SHIFT, - "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); - kvm_ipa_limit = ipa_max; - kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); + kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); + WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, + "KVM IPA Size Limit (%d bits) is smaller than default size\n", + kvm_ipa_limit); + kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); return 0; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9ca270603980..d9117bc56237 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - if (!(p->Op2 & 1)) - pmceid = read_sysreg(pmceid0_el0); - else - pmceid = read_sysreg(pmceid1_el0); + pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1)); p->regval = pmceid; diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index b13a9e3f99dd..f38c40a76251 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v) return 0; } -static const struct seq_operations vgic_debug_seq_ops = { +static const struct seq_operations vgic_debug_sops = { .start = vgic_debug_start, .next = vgic_debug_next, .stop = vgic_debug_stop, .show = vgic_debug_show }; -static int debug_open(struct inode *inode, struct file *file) -{ - int ret; - ret = seq_open(file, &vgic_debug_seq_ops); - if (!ret) { - struct seq_file *seq; - /* seq_open will have modified file->private_data */ - seq = file->private_data; - seq->private = inode->i_private; - } - - return ret; -}; - -static const struct file_operations vgic_debug_fops = { - .owner = THIS_MODULE, - .open = debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release -}; +DEFINE_SEQ_ATTRIBUTE(vgic_debug); void vgic_debug_init(struct kvm *kvm) { diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 76e2d85789ed..9cdf39a94a63 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) if (likely(cpu_if->vgic_sre)) kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if)); + kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) vgic_v3_vmcr_sync(vcpu); - kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if)); + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6c45350e33aa..93e87b287556 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -6,7 +6,7 @@ #include <linux/gfp.h> #include <linux/cache.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/dma-iommu.h> #include <xen/xen.h> #include <xen/swiotlb-xen.h> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f0bf86d81622..095540667f0f 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -21,8 +21,7 @@ #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/dma-direct.h> -#include <linux/dma-mapping.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/efi.h> #include <linux/swiotlb.h> #include <linux/vmalloc.h> @@ -54,12 +53,6 @@ s64 memstart_addr __ro_after_init = -1; EXPORT_SYMBOL(memstart_addr); -s64 physvirt_offset __ro_after_init; -EXPORT_SYMBOL(physvirt_offset); - -struct page *vmemmap __ro_after_init; -EXPORT_SYMBOL(vmemmap); - /* * We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of * memory as some devices, namely the Raspberry Pi 4, have peripherals with @@ -290,20 +283,6 @@ void __init arm64_memblock_init(void) memstart_addr = round_down(memblock_start_of_DRAM(), ARM64_MEMSTART_ALIGN); - physvirt_offset = PHYS_OFFSET - PAGE_OFFSET; - - vmemmap = ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)); - - /* - * If we are running with a 52-bit kernel VA config on a system that - * does not support it, we have to offset our vmemmap and physvirt_offset - * s.t. we avoid the 52-bit portion of the direct linear map - */ - if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52)) { - vmemmap += (_PAGE_OFFSET(48) - _PAGE_OFFSET(52)) >> PAGE_SHIFT; - physvirt_offset = PHYS_OFFSET - _PAGE_OFFSET(48); - } - /* * Remove the memory that we will not be able to cover with the * linear mapping. Take care not to clip the kernel which may be @@ -319,6 +298,16 @@ void __init arm64_memblock_init(void) } /* + * If we are running with a 52-bit kernel VA config on a system that + * does not support it, we have to place the available physical + * memory in the 48-bit addressable part of the linear region, i.e., + * we have to move it upward. Since memstart_addr represents the + * physical address of PAGE_OFFSET, we have to *subtract* from it. + */ + if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52)) + memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52); + + /* * Apply the memory limit if it was set. Since the kernel may be loaded * high up in memory, add back the kernel region that must be accessible * via the linear mapping. @@ -429,6 +418,8 @@ void __init bootmem_init(void) arm64_hugetlb_cma_reserve(); #endif + dma_pernuma_cma_reserve(); + /* * sparse_init() tries to allocate memory from memblock, so must be * done after the fixed reservations diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 6444ebfd06a6..48d66bf0465d 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -22,6 +22,7 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select MMU_GATHER_NO_RANGE if MMU + select SET_FS config MMU def_bool n diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index d05c78eace1b..a3f15b9a79da 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -316,8 +316,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags, if (thread_info_flags & (1 << TIF_SIGPENDING)) do_signal(regs, syscall); - if (thread_info_flags & (1 << TIF_NOTIFY_RESUME)) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & (1 << TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); - } } diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c index a5909091cb14..03df07a831fc 100644 --- a/arch/c6x/mm/dma-coherent.c +++ b/arch/c6x/mm/dma-coherent.c @@ -15,7 +15,7 @@ #include <linux/bitops.h> #include <linux/module.h> #include <linux/interrupt.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/memblock.h> #include <asm/cacheflush.h> diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 7f424c85772c..268fad5f51cf 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -78,6 +78,7 @@ config CSKY select PCI_DOMAINS_GENERIC if PCI select PCI_SYSCALL if PCI select PCI_MSI if PCI + select SET_FS config LOCKDEP_SUPPORT def_bool y diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 0481f4e34538..e4cab16056d6 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -7,7 +7,7 @@ #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/start_kernel.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/screen_info.h> #include <asm/sections.h> #include <asm/mmu_context.h> diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 970895df75ec..8b068cf37447 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -261,7 +261,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, do_signal(regs); if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 8f6571ae27c8..c3a775a7e8f9 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -2,9 +2,7 @@ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #include <linux/cache.h> -#include <linux/dma-mapping.h> -#include <linux/dma-contiguous.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/genalloc.h> #include <linux/highmem.h> #include <linux/io.h> diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index d11666d538fe..7945de067e9f 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -25,6 +25,7 @@ config H8300 select HAVE_ARCH_KGDB select HAVE_ARCH_HASH select CPU_NO_EFFICIENT_FFS + select SET_FS select UACCESS_MEMCPY config CPU_BIG_ENDIAN diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 83ce3caf7313..aea0a40b77a9 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -172,5 +172,5 @@ asmlinkage int sys_clone(unsigned long __user *args) kargs.exit_signal = (lower_32_bits(clone_flags) & CSIGNAL); kargs.stack = newsp; - return _do_fork(&kargs); + return kernel_clone(&kargs); } diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 69e68949787f..75d9b7e626b2 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -282,8 +282,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 667cfc511cf9..f2afabbadd43 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,6 +31,7 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES + select SET_FS help Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c index 25f388d9cfcc..00b9a81075dd 100644 --- a/arch/hexagon/kernel/dma.c +++ b/arch/hexagon/kernel/dma.c @@ -5,7 +5,7 @@ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. */ -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/memblock.h> #include <linux/genalloc.h> #include <linux/module.h> diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index dfd322c5ce83..5a0a95d93ddb 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -180,7 +180,6 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) } if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); return 1; } diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 996c410f2152..39b25a5a591b 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -8,6 +8,7 @@ menu "Processor type and features" config IA64 bool + select ARCH_HAS_DMA_MARK_CLEAN select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ACPI @@ -32,8 +33,6 @@ config IA64 select TTY select HAVE_ARCH_TRACEHOOK select HAVE_VIRT_CPU_ACCOUNTING - select DMA_NONCOHERENT_MMAP - select ARCH_HAS_SYNC_DMA_FOR_CPU select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP @@ -57,6 +56,7 @@ config IA64 select NEED_SG_DMA_LENGTH select NUMA if !FLATMEM select PCI_MSI_ARCH_FALLBACKS if PCI_MSI + select SET_FS default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 2876a7df1b0a..703b1c4f6d12 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -20,7 +20,6 @@ CHECKFLAGS += -D__ia64=1 -D__ia64__=1 -D_LP64 -D__LP64__ OBJCOPYFLAGS := --strip-all LDFLAGS_vmlinux := -static -KBUILD_LDS_MODULE += $(srctree)/arch/ia64/module.lds KBUILD_AFLAGS_KERNEL := -mconstant-gp EXTRA := diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 656a4888c300..9148ddbf02e5 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -33,7 +33,7 @@ #include <linux/bitops.h> /* hweight64() */ #include <linux/crash_dump.h> #include <linux/iommu-helper.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/prefetch.h> #include <linux/swiotlb.h> @@ -485,8 +485,7 @@ sba_search_bitmap(struct ioc *ioc, struct device *dev, ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0); ASSERT(res_ptr < res_end); - boundary_size = (unsigned long long)dma_get_seg_boundary(dev) + 1; - boundary_size = ALIGN(boundary_size, 1ULL << iovp_shift) >> iovp_shift; + boundary_size = dma_get_seg_boundary_nr_pages(dev, iovp_shift); BUG_ON(ioc->ibase & ~iovp_mask); shift = ioc->ibase >> iovp_shift; @@ -2071,6 +2070,8 @@ static const struct dma_map_ops sba_dma_ops = { .dma_supported = sba_dma_supported, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; static int __init diff --git a/arch/ia64/module.lds b/arch/ia64/include/asm/module.lds.h index eff68f362793..eff68f362793 100644 --- a/arch/ia64/module.lds +++ b/arch/ia64/include/asm/module.lds.h diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 81901c5e5426..c89bd5f8cbf8 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -40,7 +40,7 @@ obj-y += esi_stub.o # must be in kernel proper endif obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_ELF_CORE) += elfcore.o # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 09ef9ce9988d..cd0c166bfbc2 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <linux/export.h> /* Set this to 1 if there is a HW IOMMU in the system */ @@ -7,15 +7,3 @@ int iommu_detected __read_mostly; const struct dma_map_ops *dma_ops; EXPORT_SYMBOL(dma_ops); - -void *arch_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) -{ - return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); -} - -void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) -{ - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); -} diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index e74e10f19fff..6b61a703bcf5 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -176,7 +176,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) ia64_do_signal(scr, in_syscall); } - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) { + if (test_thread_flag(TIF_NOTIFY_RESUME)) { local_irq_enable(); /* force interrupt enable */ tracehook_notify_resume(&scr->pt); } @@ -271,7 +271,7 @@ ia64_load_extra (struct task_struct *task) * * <clone syscall> <some kernel call frames> * sys_clone : - * _do_fork _do_fork + * kernel_clone kernel_clone * copy_thread copy_thread * * This means that the stack layout is as follows: @@ -411,7 +411,7 @@ asmlinkage long ia64_clone(unsigned long clone_flags, unsigned long stack_start, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } static void diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 4799c96c325f..b96ed8b8a508 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -360,3 +360,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 8e7b8c6c576e..ef12e097f318 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -8,7 +8,7 @@ #include <linux/kernel.h> #include <linux/init.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/dmar.h> #include <linux/efi.h> #include <linux/elf.h> @@ -73,8 +73,7 @@ __ia64_sync_icache_dcache (pte_t pte) * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. */ -void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, - enum dma_data_direction dir) +void arch_dma_mark_clean(phys_addr_t paddr, size_t size) { unsigned long pfn = PHYS_PFN(paddr); @@ -538,7 +537,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } @@ -548,7 +547,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone, { if (!vmem_map) { memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; struct memmap_init_callback_data args; diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 93bbb74ea876..372e4e69c43a 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -31,6 +31,8 @@ config M68K select NO_DMA if !MMU && !COLDFIRE select OLD_SIGACTION select OLD_SIGSUSPEND3 + select SET_FS + select UACCESS_MEMCPY if !MMU select VIRT_TO_BUS config CPU_BIG_ENDIAN diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index 4438ffb4bbe1..ea14f2046fb4 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -75,7 +75,6 @@ KBUILD_CPPFLAGS += -D__uClinux__ endif KBUILD_LDFLAGS := -m m68kelf -KBUILD_LDS_MODULE += $(srctree)/arch/m68k/kernel/module.lds ifdef CONFIG_SUN3 LDFLAGS_vmlinux = -N diff --git a/arch/m68k/coldfire/device.c b/arch/m68k/coldfire/device.c index 9ef4ec0aea00..59f7dfe50a4d 100644 --- a/arch/m68k/coldfire/device.c +++ b/arch/m68k/coldfire/device.c @@ -554,7 +554,7 @@ static struct platform_device mcf_edma = { }; #endif /* IS_ENABLED(CONFIG_MCF_EDMA) */ -#if IS_ENABLED(CONFIG_MMC) +#ifdef MCFSDHC_BASE static struct mcf_esdhc_platform_data mcf_esdhc_data = { .max_bus_width = 4, .cd_type = ESDHC_CD_NONE, @@ -579,7 +579,7 @@ static struct platform_device mcf_esdhc = { .resource = mcf_esdhc_resources, .dev.platform_data = &mcf_esdhc_data, }; -#endif /* IS_ENABLED(CONFIG_MMC) */ +#endif /* MCFSDHC_BASE */ static struct platform_device *mcf_devices[] __initdata = { &mcf_uart, @@ -613,7 +613,7 @@ static struct platform_device *mcf_devices[] __initdata = { #if IS_ENABLED(CONFIG_MCF_EDMA) &mcf_edma, #endif -#if IS_ENABLED(CONFIG_MMC) +#ifdef MCFSDHC_BASE &mcf_esdhc, #endif }; diff --git a/arch/m68k/kernel/module.lds b/arch/m68k/include/asm/module.lds.h index fda94fa38243..fda94fa38243 100644 --- a/arch/m68k/kernel/module.lds +++ b/arch/m68k/include/asm/module.lds.h diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index e896466a41a4..f98208ccbbcd 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -1,7 +1,397 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifdef __uClinux__ -#include <asm/uaccess_no.h> +#ifndef __M68K_UACCESS_H +#define __M68K_UACCESS_H + +#ifdef CONFIG_MMU + +/* + * User space memory access functions + */ +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/segment.h> +#include <asm/extable.h> + +/* We let the MMU do all checking */ +static inline int access_ok(const void __user *addr, + unsigned long size) +{ + return 1; +} + +/* + * Not all varients of the 68k family support the notion of address spaces. + * The traditional 680x0 parts do, and they use the sfc/dfc registers and + * the "moves" instruction to access user space from kernel space. Other + * family members like ColdFire don't support this, and only have a single + * address space, and use the usual "move" instruction for user space access. + * + * Outside of this difference the user space access functions are the same. + * So lets keep the code simple and just define in what we need to use. + */ +#ifdef CONFIG_CPU_HAS_ADDRESS_SPACES +#define MOVES "moves" #else -#include <asm/uaccess_mm.h> +#define MOVES "move" #endif -#include <asm/extable.h> + +extern int __put_user_bad(void); +extern int __get_user_bad(void); + +#define __put_user_asm(res, x, ptr, bwl, reg, err) \ +asm volatile ("\n" \ + "1: "MOVES"."#bwl" %2,%1\n" \ + "2:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: moveq.l %3,%0\n" \ + " jra 2b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .previous" \ + : "+d" (res), "=m" (*(ptr)) \ + : #reg (x), "i" (err)) + +/* + * These are the main single-value transfer routines. They automatically + * use the right size if we just have the right pointer type. + */ + +#define __put_user(x, ptr) \ +({ \ + typeof(*(ptr)) __pu_val = (x); \ + int __pu_err = 0; \ + __chk_user_ptr(ptr); \ + switch (sizeof (*(ptr))) { \ + case 1: \ + __put_user_asm(__pu_err, __pu_val, ptr, b, d, -EFAULT); \ + break; \ + case 2: \ + __put_user_asm(__pu_err, __pu_val, ptr, w, r, -EFAULT); \ + break; \ + case 4: \ + __put_user_asm(__pu_err, __pu_val, ptr, l, r, -EFAULT); \ + break; \ + case 8: \ + { \ + const void __user *__pu_ptr = (ptr); \ + asm volatile ("\n" \ + "1: "MOVES".l %2,(%1)+\n" \ + "2: "MOVES".l %R2,(%1)\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: movel %3,%0\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .long 3b,10b\n" \ + " .previous" \ + : "+d" (__pu_err), "+a" (__pu_ptr) \ + : "r" (__pu_val), "i" (-EFAULT) \ + : "memory"); \ + break; \ + } \ + default: \ + __pu_err = __put_user_bad(); \ + break; \ + } \ + __pu_err; \ +}) +#define put_user(x, ptr) __put_user(x, ptr) + + +#define __get_user_asm(res, x, ptr, type, bwl, reg, err) ({ \ + type __gu_val; \ + asm volatile ("\n" \ + "1: "MOVES"."#bwl" %2,%1\n" \ + "2:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: move.l %3,%0\n" \ + " sub.l %1,%1\n" \ + " jra 2b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .previous" \ + : "+d" (res), "=&" #reg (__gu_val) \ + : "m" (*(ptr)), "i" (err)); \ + (x) = (__force typeof(*(ptr)))(__force unsigned long)__gu_val; \ +}) + +#define __get_user(x, ptr) \ +({ \ + int __gu_err = 0; \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: \ + __get_user_asm(__gu_err, x, ptr, u8, b, d, -EFAULT); \ + break; \ + case 2: \ + __get_user_asm(__gu_err, x, ptr, u16, w, r, -EFAULT); \ + break; \ + case 4: \ + __get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \ + break; \ + case 8: { \ + const void __user *__gu_ptr = (ptr); \ + union { \ + u64 l; \ + __typeof__(*(ptr)) t; \ + } __gu_val; \ + asm volatile ("\n" \ + "1: "MOVES".l (%2)+,%1\n" \ + "2: "MOVES".l (%2),%R1\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: move.l %3,%0\n" \ + " sub.l %1,%1\n" \ + " sub.l %R1,%R1\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .previous" \ + : "+d" (__gu_err), "=&r" (__gu_val.l), \ + "+a" (__gu_ptr) \ + : "i" (-EFAULT) \ + : "memory"); \ + (x) = __gu_val.t; \ + break; \ + } \ + default: \ + __gu_err = __get_user_bad(); \ + break; \ + } \ + __gu_err; \ +}) +#define get_user(x, ptr) __get_user(x, ptr) + +unsigned long __generic_copy_from_user(void *to, const void __user *from, unsigned long n); +unsigned long __generic_copy_to_user(void __user *to, const void *from, unsigned long n); + +#define __suffix0 +#define __suffix1 b +#define __suffix2 w +#define __suffix4 l + +#define ____constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)\ + asm volatile ("\n" \ + "1: "MOVES"."#s1" (%2)+,%3\n" \ + " move."#s1" %3,(%1)+\n" \ + " .ifnc \""#s2"\",\"\"\n" \ + "2: "MOVES"."#s2" (%2)+,%3\n" \ + " move."#s2" %3,(%1)+\n" \ + " .ifnc \""#s3"\",\"\"\n" \ + "3: "MOVES"."#s3" (%2)+,%3\n" \ + " move."#s3" %3,(%1)+\n" \ + " .endif\n" \ + " .endif\n" \ + "4:\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10f\n" \ + " .ifnc \""#s2"\",\"\"\n" \ + " .long 2b,20f\n" \ + " .ifnc \""#s3"\",\"\"\n" \ + " .long 3b,30f\n" \ + " .endif\n" \ + " .endif\n" \ + " .previous\n" \ + "\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: addq.l #"#n1",%0\n" \ + " .ifnc \""#s2"\",\"\"\n" \ + "20: addq.l #"#n2",%0\n" \ + " .ifnc \""#s3"\",\"\"\n" \ + "30: addq.l #"#n3",%0\n" \ + " .endif\n" \ + " .endif\n" \ + " jra 4b\n" \ + " .previous\n" \ + : "+d" (res), "+&a" (to), "+a" (from), "=&d" (tmp) \ + : : "memory") + +#define ___constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)\ + ____constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3) +#define __constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3) \ + ___constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, \ + __suffix##n1, __suffix##n2, __suffix##n3) + +static __always_inline unsigned long +__constant_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned long res = 0, tmp; + + switch (n) { + case 1: + __constant_copy_from_user_asm(res, to, from, tmp, 1, 0, 0); + break; + case 2: + __constant_copy_from_user_asm(res, to, from, tmp, 2, 0, 0); + break; + case 3: + __constant_copy_from_user_asm(res, to, from, tmp, 2, 1, 0); + break; + case 4: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 0, 0); + break; + case 5: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 1, 0); + break; + case 6: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 2, 0); + break; + case 7: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 2, 1); + break; + case 8: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 0); + break; + case 9: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 1); + break; + case 10: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 2); + break; + case 12: + __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 4); + break; + default: + /* we limit the inlined version to 3 moves */ + return __generic_copy_from_user(to, from, n); + } + + return res; +} + +#define __constant_copy_to_user_asm(res, to, from, tmp, n, s1, s2, s3) \ + asm volatile ("\n" \ + " move."#s1" (%2)+,%3\n" \ + "11: "MOVES"."#s1" %3,(%1)+\n" \ + "12: move."#s2" (%2)+,%3\n" \ + "21: "MOVES"."#s2" %3,(%1)+\n" \ + "22:\n" \ + " .ifnc \""#s3"\",\"\"\n" \ + " move."#s3" (%2)+,%3\n" \ + "31: "MOVES"."#s3" %3,(%1)+\n" \ + "32:\n" \ + " .endif\n" \ + "4:\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 11b,5f\n" \ + " .long 12b,5f\n" \ + " .long 21b,5f\n" \ + " .long 22b,5f\n" \ + " .ifnc \""#s3"\",\"\"\n" \ + " .long 31b,5f\n" \ + " .long 32b,5f\n" \ + " .endif\n" \ + " .previous\n" \ + "\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "5: moveq.l #"#n",%0\n" \ + " jra 4b\n" \ + " .previous\n" \ + : "+d" (res), "+a" (to), "+a" (from), "=&d" (tmp) \ + : : "memory") + +static __always_inline unsigned long +__constant_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + unsigned long res = 0, tmp; + + switch (n) { + case 1: + __put_user_asm(res, *(u8 *)from, (u8 __user *)to, b, d, 1); + break; + case 2: + __put_user_asm(res, *(u16 *)from, (u16 __user *)to, w, r, 2); + break; + case 3: + __constant_copy_to_user_asm(res, to, from, tmp, 3, w, b,); + break; + case 4: + __put_user_asm(res, *(u32 *)from, (u32 __user *)to, l, r, 4); + break; + case 5: + __constant_copy_to_user_asm(res, to, from, tmp, 5, l, b,); + break; + case 6: + __constant_copy_to_user_asm(res, to, from, tmp, 6, l, w,); + break; + case 7: + __constant_copy_to_user_asm(res, to, from, tmp, 7, l, w, b); + break; + case 8: + __constant_copy_to_user_asm(res, to, from, tmp, 8, l, l,); + break; + case 9: + __constant_copy_to_user_asm(res, to, from, tmp, 9, l, l, b); + break; + case 10: + __constant_copy_to_user_asm(res, to, from, tmp, 10, l, l, w); + break; + case 12: + __constant_copy_to_user_asm(res, to, from, tmp, 12, l, l, l); + break; + default: + /* limit the inlined version to 3 moves */ + return __generic_copy_to_user(to, from, n); + } + + return res; +} + +static inline unsigned long +raw_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + if (__builtin_constant_p(n)) + return __constant_copy_from_user(to, from, n); + return __generic_copy_from_user(to, from, n); +} + +static inline unsigned long +raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + if (__builtin_constant_p(n)) + return __constant_copy_to_user(to, from, n); + return __generic_copy_to_user(to, from, n); +} +#define INLINE_COPY_FROM_USER +#define INLINE_COPY_TO_USER + +#define user_addr_max() \ + (uaccess_kernel() ? ~0UL : TASK_SIZE) + +extern long strncpy_from_user(char *dst, const char __user *src, long count); +extern __must_check long strnlen_user(const char __user *str, long n); + +unsigned long __clear_user(void __user *to, unsigned long n); + +#define clear_user __clear_user + +#else /* !CONFIG_MMU */ +#include <asm-generic/uaccess.h> +#endif + +#endif /* _M68K_UACCESS_H */ diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h deleted file mode 100644 index 9ae9f8d05925..000000000000 --- a/arch/m68k/include/asm/uaccess_mm.h +++ /dev/null @@ -1,390 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __M68K_UACCESS_H -#define __M68K_UACCESS_H - -/* - * User space memory access functions - */ -#include <linux/compiler.h> -#include <linux/types.h> -#include <asm/segment.h> - -/* We let the MMU do all checking */ -static inline int access_ok(const void __user *addr, - unsigned long size) -{ - return 1; -} - -/* - * Not all varients of the 68k family support the notion of address spaces. - * The traditional 680x0 parts do, and they use the sfc/dfc registers and - * the "moves" instruction to access user space from kernel space. Other - * family members like ColdFire don't support this, and only have a single - * address space, and use the usual "move" instruction for user space access. - * - * Outside of this difference the user space access functions are the same. - * So lets keep the code simple and just define in what we need to use. - */ -#ifdef CONFIG_CPU_HAS_ADDRESS_SPACES -#define MOVES "moves" -#else -#define MOVES "move" -#endif - -extern int __put_user_bad(void); -extern int __get_user_bad(void); - -#define __put_user_asm(res, x, ptr, bwl, reg, err) \ -asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ - "2:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: moveq.l %3,%0\n" \ - " jra 2b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .previous" \ - : "+d" (res), "=m" (*(ptr)) \ - : #reg (x), "i" (err)) - -/* - * These are the main single-value transfer routines. They automatically - * use the right size if we just have the right pointer type. - */ - -#define __put_user(x, ptr) \ -({ \ - typeof(*(ptr)) __pu_val = (x); \ - int __pu_err = 0; \ - __chk_user_ptr(ptr); \ - switch (sizeof (*(ptr))) { \ - case 1: \ - __put_user_asm(__pu_err, __pu_val, ptr, b, d, -EFAULT); \ - break; \ - case 2: \ - __put_user_asm(__pu_err, __pu_val, ptr, w, r, -EFAULT); \ - break; \ - case 4: \ - __put_user_asm(__pu_err, __pu_val, ptr, l, r, -EFAULT); \ - break; \ - case 8: \ - { \ - const void __user *__pu_ptr = (ptr); \ - asm volatile ("\n" \ - "1: "MOVES".l %2,(%1)+\n" \ - "2: "MOVES".l %R2,(%1)\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: movel %3,%0\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .long 3b,10b\n" \ - " .previous" \ - : "+d" (__pu_err), "+a" (__pu_ptr) \ - : "r" (__pu_val), "i" (-EFAULT) \ - : "memory"); \ - break; \ - } \ - default: \ - __pu_err = __put_user_bad(); \ - break; \ - } \ - __pu_err; \ -}) -#define put_user(x, ptr) __put_user(x, ptr) - - -#define __get_user_asm(res, x, ptr, type, bwl, reg, err) ({ \ - type __gu_val; \ - asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ - "2:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: move.l %3,%0\n" \ - " sub.l %1,%1\n" \ - " jra 2b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .previous" \ - : "+d" (res), "=&" #reg (__gu_val) \ - : "m" (*(ptr)), "i" (err)); \ - (x) = (__force typeof(*(ptr)))(__force unsigned long)__gu_val; \ -}) - -#define __get_user(x, ptr) \ -({ \ - int __gu_err = 0; \ - __chk_user_ptr(ptr); \ - switch (sizeof(*(ptr))) { \ - case 1: \ - __get_user_asm(__gu_err, x, ptr, u8, b, d, -EFAULT); \ - break; \ - case 2: \ - __get_user_asm(__gu_err, x, ptr, u16, w, r, -EFAULT); \ - break; \ - case 4: \ - __get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \ - break; \ - case 8: { \ - const void __user *__gu_ptr = (ptr); \ - union { \ - u64 l; \ - __typeof__(*(ptr)) t; \ - } __gu_val; \ - asm volatile ("\n" \ - "1: "MOVES".l (%2)+,%1\n" \ - "2: "MOVES".l (%2),%R1\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: move.l %3,%0\n" \ - " sub.l %1,%1\n" \ - " sub.l %R1,%R1\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .previous" \ - : "+d" (__gu_err), "=&r" (__gu_val.l), \ - "+a" (__gu_ptr) \ - : "i" (-EFAULT) \ - : "memory"); \ - (x) = __gu_val.t; \ - break; \ - } \ - default: \ - __gu_err = __get_user_bad(); \ - break; \ - } \ - __gu_err; \ -}) -#define get_user(x, ptr) __get_user(x, ptr) - -unsigned long __generic_copy_from_user(void *to, const void __user *from, unsigned long n); -unsigned long __generic_copy_to_user(void __user *to, const void *from, unsigned long n); - -#define __suffix0 -#define __suffix1 b -#define __suffix2 w -#define __suffix4 l - -#define ____constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)\ - asm volatile ("\n" \ - "1: "MOVES"."#s1" (%2)+,%3\n" \ - " move."#s1" %3,(%1)+\n" \ - " .ifnc \""#s2"\",\"\"\n" \ - "2: "MOVES"."#s2" (%2)+,%3\n" \ - " move."#s2" %3,(%1)+\n" \ - " .ifnc \""#s3"\",\"\"\n" \ - "3: "MOVES"."#s3" (%2)+,%3\n" \ - " move."#s3" %3,(%1)+\n" \ - " .endif\n" \ - " .endif\n" \ - "4:\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10f\n" \ - " .ifnc \""#s2"\",\"\"\n" \ - " .long 2b,20f\n" \ - " .ifnc \""#s3"\",\"\"\n" \ - " .long 3b,30f\n" \ - " .endif\n" \ - " .endif\n" \ - " .previous\n" \ - "\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: addq.l #"#n1",%0\n" \ - " .ifnc \""#s2"\",\"\"\n" \ - "20: addq.l #"#n2",%0\n" \ - " .ifnc \""#s3"\",\"\"\n" \ - "30: addq.l #"#n3",%0\n" \ - " .endif\n" \ - " .endif\n" \ - " jra 4b\n" \ - " .previous\n" \ - : "+d" (res), "+&a" (to), "+a" (from), "=&d" (tmp) \ - : : "memory") - -#define ___constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)\ - ____constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3) -#define __constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3) \ - ___constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, \ - __suffix##n1, __suffix##n2, __suffix##n3) - -static __always_inline unsigned long -__constant_copy_from_user(void *to, const void __user *from, unsigned long n) -{ - unsigned long res = 0, tmp; - - switch (n) { - case 1: - __constant_copy_from_user_asm(res, to, from, tmp, 1, 0, 0); - break; - case 2: - __constant_copy_from_user_asm(res, to, from, tmp, 2, 0, 0); - break; - case 3: - __constant_copy_from_user_asm(res, to, from, tmp, 2, 1, 0); - break; - case 4: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 0, 0); - break; - case 5: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 1, 0); - break; - case 6: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 2, 0); - break; - case 7: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 2, 1); - break; - case 8: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 0); - break; - case 9: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 1); - break; - case 10: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 2); - break; - case 12: - __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 4); - break; - default: - /* we limit the inlined version to 3 moves */ - return __generic_copy_from_user(to, from, n); - } - - return res; -} - -#define __constant_copy_to_user_asm(res, to, from, tmp, n, s1, s2, s3) \ - asm volatile ("\n" \ - " move."#s1" (%2)+,%3\n" \ - "11: "MOVES"."#s1" %3,(%1)+\n" \ - "12: move."#s2" (%2)+,%3\n" \ - "21: "MOVES"."#s2" %3,(%1)+\n" \ - "22:\n" \ - " .ifnc \""#s3"\",\"\"\n" \ - " move."#s3" (%2)+,%3\n" \ - "31: "MOVES"."#s3" %3,(%1)+\n" \ - "32:\n" \ - " .endif\n" \ - "4:\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 11b,5f\n" \ - " .long 12b,5f\n" \ - " .long 21b,5f\n" \ - " .long 22b,5f\n" \ - " .ifnc \""#s3"\",\"\"\n" \ - " .long 31b,5f\n" \ - " .long 32b,5f\n" \ - " .endif\n" \ - " .previous\n" \ - "\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "5: moveq.l #"#n",%0\n" \ - " jra 4b\n" \ - " .previous\n" \ - : "+d" (res), "+a" (to), "+a" (from), "=&d" (tmp) \ - : : "memory") - -static __always_inline unsigned long -__constant_copy_to_user(void __user *to, const void *from, unsigned long n) -{ - unsigned long res = 0, tmp; - - switch (n) { - case 1: - __put_user_asm(res, *(u8 *)from, (u8 __user *)to, b, d, 1); - break; - case 2: - __put_user_asm(res, *(u16 *)from, (u16 __user *)to, w, r, 2); - break; - case 3: - __constant_copy_to_user_asm(res, to, from, tmp, 3, w, b,); - break; - case 4: - __put_user_asm(res, *(u32 *)from, (u32 __user *)to, l, r, 4); - break; - case 5: - __constant_copy_to_user_asm(res, to, from, tmp, 5, l, b,); - break; - case 6: - __constant_copy_to_user_asm(res, to, from, tmp, 6, l, w,); - break; - case 7: - __constant_copy_to_user_asm(res, to, from, tmp, 7, l, w, b); - break; - case 8: - __constant_copy_to_user_asm(res, to, from, tmp, 8, l, l,); - break; - case 9: - __constant_copy_to_user_asm(res, to, from, tmp, 9, l, l, b); - break; - case 10: - __constant_copy_to_user_asm(res, to, from, tmp, 10, l, l, w); - break; - case 12: - __constant_copy_to_user_asm(res, to, from, tmp, 12, l, l, l); - break; - default: - /* limit the inlined version to 3 moves */ - return __generic_copy_to_user(to, from, n); - } - - return res; -} - -static inline unsigned long -raw_copy_from_user(void *to, const void __user *from, unsigned long n) -{ - if (__builtin_constant_p(n)) - return __constant_copy_from_user(to, from, n); - return __generic_copy_from_user(to, from, n); -} - -static inline unsigned long -raw_copy_to_user(void __user *to, const void *from, unsigned long n) -{ - if (__builtin_constant_p(n)) - return __constant_copy_to_user(to, from, n); - return __generic_copy_to_user(to, from, n); -} -#define INLINE_COPY_FROM_USER -#define INLINE_COPY_TO_USER - -#define user_addr_max() \ - (uaccess_kernel() ? ~0UL : TASK_SIZE) - -extern long strncpy_from_user(char *dst, const char __user *src, long count); -extern __must_check long strnlen_user(const char __user *str, long n); - -unsigned long __clear_user(void __user *to, unsigned long n); - -#define clear_user __clear_user - -#endif /* _M68K_UACCESS_H */ diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h deleted file mode 100644 index dcfb69361408..000000000000 --- a/arch/m68k/include/asm/uaccess_no.h +++ /dev/null @@ -1,160 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __M68KNOMMU_UACCESS_H -#define __M68KNOMMU_UACCESS_H - -/* - * User space memory access functions - */ -#include <linux/string.h> - -#include <asm/segment.h> - -#define access_ok(addr,size) _access_ok((unsigned long)(addr),(size)) - -/* - * It is not enough to just have access_ok check for a real RAM address. - * This would disallow the case of code/ro-data running XIP in flash/rom. - * Ideally we would check the possible flash ranges too, but that is - * currently not so easy. - */ -static inline int _access_ok(unsigned long addr, unsigned long size) -{ - return 1; -} - -/* - * These are the main single-value transfer routines. They automatically - * use the right size if we just have the right pointer type. - */ - -#define put_user(x, ptr) \ -({ \ - int __pu_err = 0; \ - typeof(*(ptr)) __pu_val = (x); \ - switch (sizeof (*(ptr))) { \ - case 1: \ - __put_user_asm(__pu_err, __pu_val, ptr, b); \ - break; \ - case 2: \ - __put_user_asm(__pu_err, __pu_val, ptr, w); \ - break; \ - case 4: \ - __put_user_asm(__pu_err, __pu_val, ptr, l); \ - break; \ - case 8: \ - memcpy((void __force *)ptr, &__pu_val, sizeof(*(ptr))); \ - break; \ - default: \ - __pu_err = __put_user_bad(); \ - break; \ - } \ - __pu_err; \ -}) -#define __put_user(x, ptr) put_user(x, ptr) - -extern int __put_user_bad(void); - -/* - * Tell gcc we read from memory instead of writing: this is because - * we do not write to any memory gcc knows about, so there are no - * aliasing issues. - */ - -#define __ptr(x) ((unsigned long __user *)(x)) - -#define __put_user_asm(err,x,ptr,bwl) \ - __asm__ ("move" #bwl " %0,%1" \ - : /* no outputs */ \ - :"d" (x),"m" (*__ptr(ptr)) : "memory") - -#define get_user(x, ptr) \ -({ \ - int __gu_err = 0; \ - switch (sizeof(*(ptr))) { \ - case 1: \ - __get_user_asm(__gu_err, x, ptr, b, "=d"); \ - break; \ - case 2: \ - __get_user_asm(__gu_err, x, ptr, w, "=r"); \ - break; \ - case 4: \ - __get_user_asm(__gu_err, x, ptr, l, "=r"); \ - break; \ - case 8: { \ - union { \ - u64 l; \ - __typeof__(*(ptr)) t; \ - } __gu_val; \ - memcpy(&__gu_val.l, (const void __force *)ptr, sizeof(__gu_val.l)); \ - (x) = __gu_val.t; \ - break; \ - } \ - default: \ - __gu_err = __get_user_bad(); \ - break; \ - } \ - __gu_err; \ -}) -#define __get_user(x, ptr) get_user(x, ptr) - -extern int __get_user_bad(void); - -#define __get_user_asm(err,x,ptr,bwl,reg) \ - __asm__ ("move" #bwl " %1,%0" \ - : "=d" (x) \ - : "m" (*__ptr(ptr))) - -static inline unsigned long -raw_copy_from_user(void *to, const void __user *from, unsigned long n) -{ - memcpy(to, (__force const void *)from, n); - return 0; -} - -static inline unsigned long -raw_copy_to_user(void __user *to, const void *from, unsigned long n) -{ - memcpy((__force void *)to, from, n); - return 0; -} -#define INLINE_COPY_FROM_USER -#define INLINE_COPY_TO_USER - -/* - * Copy a null terminated string from userspace. - */ - -static inline long -strncpy_from_user(char *dst, const char *src, long count) -{ - char *tmp; - strncpy(dst, src, count); - for (tmp = dst; *tmp && count > 0; tmp++, count--) - ; - return(tmp - dst); /* DAVIDM should we count a NUL ? check getname */ -} - -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ -static inline long strnlen_user(const char *src, long n) -{ - return(strlen(src) + 1); /* DAVIDM make safer */ -} - -/* - * Zero Userspace - */ - -static inline unsigned long -__clear_user(void *to, unsigned long n) -{ - memset(to, 0, n); - return 0; -} - -#define clear_user(to,n) __clear_user(to,n) - -#endif /* _M68KNOMMU_UACCESS_H */ diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c index b1ca3522eccc..1c1b875fadc1 100644 --- a/arch/m68k/kernel/dma.c +++ b/arch/m68k/kernel/dma.c @@ -6,7 +6,7 @@ #undef DEBUG -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/device.h> #include <linux/kernel.h> #include <linux/platform_device.h> diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 6492a2c54dbc..08359a6e058f 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -107,10 +107,10 @@ void flush_thread(void) * on top of pt_regs, which means that sys_clone() arguments would be * buried. We could, of course, copy them, but it's too costly for no * good reason - generic clone() would have to copy them *again* for - * _do_fork() anyway. So in this case it's actually better to pass pt_regs * - * and extract arguments for _do_fork() from there. Eventually we might - * go for calling _do_fork() directly from the wrapper, but only after we - * are finished with _do_fork() prototype conversion. + * kernel_clone() anyway. So in this case it's actually better to pass pt_regs * + * and extract arguments for kernel_clone() from there. Eventually we might + * go for calling kernel_clone() directly from the wrapper, but only after we + * are finished with kernel_clone() prototype conversion. */ asmlinkage int m68k_clone(struct pt_regs *regs) { @@ -125,7 +125,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) .tls = regs->d5, }; - return _do_fork(&args); + return kernel_clone(&args); } /* diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index a98fca977073..46f91e0f6a08 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -920,7 +920,8 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, err |= __put_user(0x70004e40 + (__NR_sigreturn << 16), (long __user *)(frame->retcode)); #else - err |= __put_user((void *) ret_from_user_signal, &frame->pretcode); + err |= __put_user((long) ret_from_user_signal, + (long __user *) &frame->pretcode); #endif if (err) @@ -1004,7 +1005,8 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, err |= __put_user(0x4e40, (short __user *)(frame->retcode + 4)); #endif #else - err |= __put_user((void *) ret_from_user_rt_signal, &frame->pretcode); + err |= __put_user((long) ret_from_user_rt_signal, + (long __user *) &frame->pretcode); #endif /* CONFIG_MMU */ if (err) @@ -1134,6 +1136,6 @@ void do_notify_resume(struct pt_regs *regs) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 81fc799d8392..625fb6d32842 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -439,3 +439,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 37bd6a5f38fb..33925ffed68f 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -47,6 +47,7 @@ config MICROBLAZE select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE if MMU select SPARSE_IRQ + select SET_FS # Endianness selection choice diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index d7bebd04247b..04d091ade417 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -8,9 +8,8 @@ */ #include <linux/device.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/gfp.h> -#include <linux/dma-debug.h> #include <linux/export.h> #include <linux/bug.h> #include <asm/cacheflush.h> diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 4a96b59f0bee..f11a0ccccabc 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -316,6 +316,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs, in_syscall); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index b4e263916f41..aae729c95cf9 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -445,3 +445,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index e09b66e43cb6..81dffe43b18c 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -11,7 +11,7 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/init.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <asm/cpuinfo.h> #include <asm/cacheflush.h> diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 0902c459c385..45da639bd22c 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -7,7 +7,7 @@ * for more details. */ -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/memblock.h> #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms index a13c4cf6e608..5483e38b5dc7 100644 --- a/arch/mips/Kbuild.platforms +++ b/arch/mips/Kbuild.platforms @@ -13,7 +13,6 @@ platform-$(CONFIG_MIPS_COBALT) += cobalt/ platform-$(CONFIG_MACH_DECSTATION) += dec/ platform-$(CONFIG_MIPS_GENERIC) += generic/ platform-$(CONFIG_MACH_JAZZ) += jazz/ -platform-$(CONFIG_MACH_INGENIC) += jz4740/ platform-$(CONFIG_LANTIQ) += lantiq/ platform-$(CONFIG_MACH_LOONGSON2EF) += loongson2ef/ platform-$(CONFIG_MACH_LOONGSON32) += loongson32/ @@ -22,7 +21,6 @@ platform-$(CONFIG_MIPS_MALTA) += mti-malta/ platform-$(CONFIG_NLM_COMMON) += netlogic/ platform-$(CONFIG_PIC32MZDA) += pic32/ platform-$(CONFIG_MACH_PISTACHIO) += pistachio/ -platform-$(CONFIG_SOC_PNX833X) += pnx833x/ platform-$(CONFIG_RALINK) += ralink/ platform-$(CONFIG_MIKROTIK_RB532) += rb532/ platform-$(CONFIG_SGI_IP22) += sgi-ip22/ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 440614dc9de2..2000bb2b0220 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -88,20 +88,41 @@ config MIPS select PERF_USE_VMALLOC select PCI_MSI_ARCH_FALLBACKS if PCI_MSI select RTC_LIB + select SET_FS select SYSCTL_EXCEPTION_TRACE select VIRT_TO_BUS config MIPS_FIXUP_BIGPHYS_ADDR bool +config MIPS_GENERIC + bool + +config MACH_INGENIC + bool + select SYS_SUPPORTS_32BIT_KERNEL + select SYS_SUPPORTS_LITTLE_ENDIAN + select SYS_SUPPORTS_ZBOOT + select DMA_NONCOHERENT + select IRQ_MIPS_CPU + select PINCTRL + select GPIOLIB + select COMMON_CLK + select GENERIC_IRQ_CHIP + select BUILTIN_DTB if MIPS_NO_APPENDED_DTB + select USE_OF + select CPU_SUPPORTS_CPUFREQ + select MIPS_EXTERNAL_TIMER + menu "Machine selection" choice prompt "System type" - default MIPS_GENERIC + default MIPS_GENERIC_KERNEL -config MIPS_GENERIC +config MIPS_GENERIC_KERNEL bool "Generic board-agnostic MIPS kernel" + select MIPS_GENERIC select BOOT_RAW select BUILTIN_DTB select CEVT_R4K @@ -138,6 +159,7 @@ config MIPS_GENERIC select SYS_SUPPORTS_MULTITHREADING select SYS_SUPPORTS_RELOCATABLE select SYS_SUPPORTS_SMARTMIPS + select SYS_SUPPORTS_ZBOOT select UHI_BOOT select USB_EHCI_BIG_ENDIAN_DESC if CPU_BIG_ENDIAN select USB_EHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN @@ -390,20 +412,11 @@ config MACH_JAZZ Members include the Acer PICA, MIPS Magnum 4000, MIPS Millennium and Olivetti M700-10 workstations. -config MACH_INGENIC +config MACH_INGENIC_SOC bool "Ingenic SoC based machines" - select SYS_SUPPORTS_32BIT_KERNEL - select SYS_SUPPORTS_LITTLE_ENDIAN + select MIPS_GENERIC + select MACH_INGENIC select SYS_SUPPORTS_ZBOOT_UART16550 - select CPU_SUPPORTS_HUGEPAGES - select DMA_NONCOHERENT - select IRQ_MIPS_CPU - select PINCTRL - select GPIOLIB - select COMMON_CLK - select GENERIC_IRQ_CHIP - select BUILTIN_DTB if MIPS_NO_APPENDED_DTB - select USE_OF config LANTIQ bool "Lantiq based platforms" @@ -476,6 +489,7 @@ config MACH_LOONGSON64 select SYS_SUPPORTS_ZBOOT select ZONE_DMA32 select NUMA + select SMP select COMMON_CLK select USE_OF select BUILTIN_DTB @@ -569,6 +583,7 @@ config MIPS_MALTA select SYS_SUPPORTS_VPE_LOADER select SYS_SUPPORTS_ZBOOT select USE_OF + select WAR_ICACHE_REFILLS select ZONE_DMA32 if 64BIT help This enables support for the MIPS Technologies Malta evaluation @@ -590,19 +605,6 @@ config MACH_VR41XX select SYS_SUPPORTS_MIPS16 select GPIOLIB -config NXP_STB220 - bool "NXP STB220 board" - select SOC_PNX833X - help - Support for NXP Semiconductors STB220 Development Board. - -config NXP_STB225 - bool "NXP 225 board" - select SOC_PNX833X - select SOC_PNX8335 - help - Support for NXP Semiconductors STB225 Development Board. - config RALINK bool "Ralink based machines" select CEVT_R4K @@ -616,6 +618,7 @@ config RALINK select SYS_SUPPORTS_32BIT_KERNEL select SYS_SUPPORTS_LITTLE_ENDIAN select SYS_SUPPORTS_MIPS16 + select SYS_SUPPORTS_ZBOOT select SYS_HAS_EARLY_PRINTK select CLKDEV_LOOKUP select ARCH_HAS_RESET_CONTROLLER @@ -652,6 +655,9 @@ config SGI_IP22 select SYS_SUPPORTS_32BIT_KERNEL select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN + select WAR_R4600_V1_INDEX_ICACHEOP + select WAR_R4600_V1_HIT_CACHEOP + select WAR_R4600_V2_HIT_CACHEOP select MIPS_L1_CACHE_SHIFT_7 help This are the SGI Indy, Challenge S and Indigo2, as well as certain @@ -679,6 +685,7 @@ config SGI_IP27 select SYS_SUPPORTS_BIG_ENDIAN select SYS_SUPPORTS_NUMA select SYS_SUPPORTS_SMP + select WAR_R10000_LLSC select MIPS_L1_CACHE_SHIFT_7 select NUMA help @@ -714,6 +721,7 @@ config SGI_IP28 select SYS_HAS_EARLY_PRINTK select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN + select WAR_R10000_LLSC select MIPS_L1_CACHE_SHIFT_7 help This is the SGI Indigo2 with R10000 processor. To compile a Linux @@ -740,6 +748,7 @@ config SGI_IP30 select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN select SYS_SUPPORTS_SMP + select WAR_R10000_LLSC select MIPS_L1_CACHE_SHIFT_7 select ARC_MEMORY help @@ -767,6 +776,7 @@ config SGI_IP32 select SYS_HAS_CPU_NEVADA select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN + select WAR_ICACHE_REFILLS help If you want this kernel to run on SGI O2 workstation, say Y here. @@ -890,6 +900,7 @@ config SNI_RM select SYS_SUPPORTS_BIG_ENDIAN select SYS_SUPPORTS_HIGHMEM select SYS_SUPPORTS_LITTLE_ENDIAN + select WAR_R4600_V2_HIT_CACHEOP help The SNI RM200/300/400 are MIPS-based machines manufactured by Siemens Nixdorf Informationssysteme (SNI), parent company of Pyramid @@ -901,6 +912,7 @@ config MACH_TX39XX config MACH_TX49XX bool "Toshiba TX49 series based machines" + select WAR_TX49XX_ICACHE_INDEX_INV config MIKROTIK_RB532 bool "Mikrotik RB532 boards" @@ -1026,8 +1038,8 @@ source "arch/mips/bcm47xx/Kconfig" source "arch/mips/bcm63xx/Kconfig" source "arch/mips/bmips/Kconfig" source "arch/mips/generic/Kconfig" +source "arch/mips/ingenic/Kconfig" source "arch/mips/jazz/Kconfig" -source "arch/mips/jz4740/Kconfig" source "arch/mips/lantiq/Kconfig" source "arch/mips/pic32/Kconfig" source "arch/mips/pistachio/Kconfig" @@ -1136,7 +1148,6 @@ config DMA_NONCOHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_DMA_SET_UNCACHED select DMA_NONCOHERENT_MMAP - select DMA_NONCOHERENT_CACHE_SYNC select NEED_DMA_MAP_STATE config SYS_HAS_EARLY_PRINTK @@ -1268,23 +1279,6 @@ config PCI_XTALK_BRIDGE config NO_EXCEPT_FILL bool -config SOC_PNX833X - bool - select CEVT_R4K - select CSRC_R4K - select IRQ_MIPS_CPU - select DMA_NONCOHERENT - select SYS_HAS_CPU_MIPS32_R2 - select SYS_SUPPORTS_32BIT_KERNEL - select SYS_SUPPORTS_LITTLE_ENDIAN - select SYS_SUPPORTS_BIG_ENDIAN - select SYS_SUPPORTS_MIPS16 - select CPU_MIPSR2_IRQ_VI - -config SOC_PNX8335 - bool - select SOC_PNX833X - config MIPS_SPRAM bool @@ -1621,7 +1615,6 @@ config CPU_P5600 select CPU_SUPPORTS_32BIT_KERNEL select CPU_SUPPORTS_HIGHMEM select CPU_SUPPORTS_MSA - select CPU_SUPPORTS_UNCACHED_ACCELERATED select CPU_SUPPORTS_CPUFREQ select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_IRQ_EI @@ -1892,6 +1885,7 @@ config SYS_SUPPORTS_ZBOOT select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_KERNEL_XZ + select HAVE_KERNEL_ZSTD config SYS_SUPPORTS_ZBOOT_UART16550 bool @@ -2273,7 +2267,7 @@ config FORCE_MAX_ZONEORDER default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB range 12 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB - range 11 64 + range 0 64 default "11" help The kernel memory allocator divides physically contiguous memory @@ -2639,6 +2633,76 @@ config MIPS_ASID_BITS_VARIABLE config MIPS_CRC_SUPPORT bool +# R4600 erratum. Due to the lack of errata information the exact +# technical details aren't known. I've experimentally found that disabling +# interrupts during indexed I-cache flushes seems to be sufficient to deal +# with the issue. +config WAR_R4600_V1_INDEX_ICACHEOP + bool + +# Pleasures of the R4600 V1.x. Cite from the IDT R4600 V1.7 errata: +# +# 18. The CACHE instructions Hit_Writeback_Invalidate_D, Hit_Writeback_D, +# Hit_Invalidate_D and Create_Dirty_Excl_D should only be +# executed if there is no other dcache activity. If the dcache is +# accessed for another instruction immeidately preceding when these +# cache instructions are executing, it is possible that the dcache +# tag match outputs used by these cache instructions will be +# incorrect. These cache instructions should be preceded by at least +# four instructions that are not any kind of load or store +# instruction. +# +# This is not allowed: lw +# nop +# nop +# nop +# cache Hit_Writeback_Invalidate_D +# +# This is allowed: lw +# nop +# nop +# nop +# nop +# cache Hit_Writeback_Invalidate_D +config WAR_R4600_V1_HIT_CACHEOP + bool + +# Writeback and invalidate the primary cache dcache before DMA. +# +# R4600 v2.0 bug: "The CACHE instructions Hit_Writeback_Inv_D, +# Hit_Writeback_D, Hit_Invalidate_D and Create_Dirty_Exclusive_D will only +# operate correctly if the internal data cache refill buffer is empty. These +# CACHE instructions should be separated from any potential data cache miss +# by a load instruction to an uncached address to empty the response buffer." +# (Revision 2.0 device errata from IDT available on https://www.idt.com/ +# in .pdf format.) +config WAR_R4600_V2_HIT_CACHEOP + bool + +# From TX49/H2 manual: "If the instruction (i.e. CACHE) is issued for +# the line which this instruction itself exists, the following +# operation is not guaranteed." +# +# Workaround: do two phase flushing for Index_Invalidate_I +config WAR_TX49XX_ICACHE_INDEX_INV + bool + +# The RM7000 processors and the E9000 cores have a bug (though PMC-Sierra +# opposes it being called that) where invalid instructions in the same +# I-cache line worth of instructions being fetched may case spurious +# exceptions. +config WAR_ICACHE_REFILLS + bool + +# On the R10000 up to version 2.6 (not sure about 2.7) there is a bug that +# may cause ll / sc and lld / scd sequences to execute non-atomically. +config WAR_R10000_LLSC + bool + +# 34K core erratum: "Problems Executing the TLBR Instruction" +config WAR_MIPS34K_MISSED_ITLB + bool + # # - Highmem only makes sense for the 32-bit kernel. # - The current highmem code will only work properly on physically indexed diff --git a/arch/mips/alchemy/Kconfig b/arch/mips/alchemy/Kconfig index 83b288b95b16..69734120ada1 100644 --- a/arch/mips/alchemy/Kconfig +++ b/arch/mips/alchemy/Kconfig @@ -1,12 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# au1000-style gpio and interrupt controllers -config ALCHEMY_GPIOINT_AU1000 - bool - -# au1300-style GPIO/INT controller -config ALCHEMY_GPIOINT_AU1300 - bool - choice prompt "Machine type" depends on MIPS_ALCHEMY @@ -15,7 +7,6 @@ choice config MIPS_MTX1 bool "4G Systems MTX-1 board" select HAVE_PCI - select ALCHEMY_GPIOINT_AU1000 select SYS_SUPPORTS_LITTLE_ENDIAN select SYS_HAS_EARLY_PRINTK @@ -33,13 +24,11 @@ config MIPS_DB1XXX config MIPS_XXS1500 bool "MyCable XXS1500 board" - select ALCHEMY_GPIOINT_AU1000 select SYS_SUPPORTS_LITTLE_ENDIAN select SYS_HAS_EARLY_PRINTK config MIPS_GPR bool "Trapeze ITS GPR board" - select ALCHEMY_GPIOINT_AU1000 select HAVE_PCI select SYS_SUPPORTS_LITTLE_ENDIAN select SYS_HAS_EARLY_PRINTK diff --git a/arch/mips/alchemy/board-gpr.c b/arch/mips/alchemy/board-gpr.c index 6c47318946e4..f587c40b6d00 100644 --- a/arch/mips/alchemy/board-gpr.c +++ b/arch/mips/alchemy/board-gpr.c @@ -31,23 +31,6 @@ const char *get_system_type(void) return "GPR"; } -void __init prom_init(void) -{ - unsigned char *memsize_str; - unsigned long memsize; - - prom_argc = fw_arg0; - prom_argv = (char **)fw_arg1; - prom_envp = (char **)fw_arg2; - - prom_init_cmdline(); - - memsize_str = prom_getenv("memsize"); - if (!memsize_str || kstrtoul(memsize_str, 0, &memsize)) - memsize = 0x04000000; - add_memory_region(0, memsize, BOOT_MEM_RAM); -} - void prom_putchar(char c) { alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); diff --git a/arch/mips/alchemy/board-mtx1.c b/arch/mips/alchemy/board-mtx1.c index 23093535399f..68ea57511629 100644 --- a/arch/mips/alchemy/board-mtx1.c +++ b/arch/mips/alchemy/board-mtx1.c @@ -30,23 +30,6 @@ const char *get_system_type(void) return "MTX-1"; } -void __init prom_init(void) -{ - unsigned char *memsize_str; - unsigned long memsize; - - prom_argc = fw_arg0; - prom_argv = (char **)fw_arg1; - prom_envp = (char **)fw_arg2; - - prom_init_cmdline(); - - memsize_str = prom_getenv("memsize"); - if (!memsize_str || kstrtoul(memsize_str, 0, &memsize)) - memsize = 0x04000000; - add_memory_region(0, memsize, BOOT_MEM_RAM); -} - void prom_putchar(char c) { alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); diff --git a/arch/mips/alchemy/board-xxs1500.c b/arch/mips/alchemy/board-xxs1500.c index c67dfe1f4997..b184baa4e56a 100644 --- a/arch/mips/alchemy/board-xxs1500.c +++ b/arch/mips/alchemy/board-xxs1500.c @@ -25,24 +25,6 @@ const char *get_system_type(void) return "XXS1500"; } -void __init prom_init(void) -{ - unsigned char *memsize_str; - unsigned long memsize; - - prom_argc = fw_arg0; - prom_argv = (char **)fw_arg1; - prom_envp = (char **)fw_arg2; - - prom_init_cmdline(); - - memsize_str = prom_getenv("memsize"); - if (!memsize_str || kstrtoul(memsize_str, 0, &memsize)) - memsize = 0x04000000; - - add_memory_region(0, memsize, BOOT_MEM_RAM); -} - void prom_putchar(char c) { alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); diff --git a/arch/mips/alchemy/common/prom.c b/arch/mips/alchemy/common/prom.c index af312b5e33f6..d910c0a64de9 100644 --- a/arch/mips/alchemy/common/prom.c +++ b/arch/mips/alchemy/common/prom.c @@ -34,6 +34,9 @@ */ #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/sizes.h> #include <linux/string.h> #include <asm/bootinfo.h> @@ -76,6 +79,24 @@ char *prom_getenv(char *envname) return NULL; } +void __init prom_init(void) +{ + unsigned char *memsize_str; + unsigned long memsize; + + prom_argc = (int)fw_arg0; + prom_argv = (char **)fw_arg1; + prom_envp = (char **)fw_arg2; + + prom_init_cmdline(); + + memsize_str = prom_getenv("memsize"); + if (!memsize_str || kstrtoul(memsize_str, 0, &memsize)) + memsize = SZ_64M; /* minimum memsize is 64MB RAM */ + + memblock_add(0, memsize); +} + static inline unsigned char str2hexnum(unsigned char c) { if (c >= '0' && c <= '9') diff --git a/arch/mips/alchemy/devboards/db1300.c b/arch/mips/alchemy/devboards/db1300.c index 8ac1f56ee57d..cd72eaa1168f 100644 --- a/arch/mips/alchemy/devboards/db1300.c +++ b/arch/mips/alchemy/devboards/db1300.c @@ -731,6 +731,7 @@ static struct platform_device db1300_lcd_dev = { /**********************************************************************/ +#if IS_ENABLED(CONFIG_TOUCHSCREEN_WM97XX) static void db1300_wm97xx_irqen(struct wm97xx *wm, int enable) { if (enable) @@ -762,6 +763,12 @@ static int db1300_wm97xx_probe(struct platform_device *pdev) return wm97xx_register_mach_ops(wm, &db1300_wm97xx_ops); } +#else +static int db1300_wm97xx_probe(struct platform_device *pdev) +{ + return -ENODEV; +} +#endif static struct platform_driver db1300_wm97xx_driver = { .driver.name = "wm97xx-touch", diff --git a/arch/mips/alchemy/devboards/platform.c b/arch/mips/alchemy/devboards/platform.c index 8d4b65c3268a..754bdd2ca630 100644 --- a/arch/mips/alchemy/devboards/platform.c +++ b/arch/mips/alchemy/devboards/platform.c @@ -20,23 +20,6 @@ #include <prom.h> -void __init prom_init(void) -{ - unsigned char *memsize_str; - unsigned long memsize; - - prom_argc = (int)fw_arg0; - prom_argv = (char **)fw_arg1; - prom_envp = (char **)fw_arg2; - - prom_init_cmdline(); - memsize_str = prom_getenv("memsize"); - if (!memsize_str || kstrtoul(memsize_str, 0, &memsize)) - memsize = 64 << 20; /* all devboards have at least 64MB RAM */ - - add_memory_region(0, memsize, BOOT_MEM_RAM); -} - void prom_putchar(char c) { if (alchemy_get_cputype() == ALCHEMY_CPU_AU1300) diff --git a/arch/mips/ar7/memory.c b/arch/mips/ar7/memory.c index ad6efb36ebfe..787716c5e946 100644 --- a/arch/mips/ar7/memory.c +++ b/arch/mips/ar7/memory.c @@ -47,7 +47,7 @@ void __init prom_meminit(void) unsigned long pages; pages = memsize() >> PAGE_SHIFT; - add_memory_region(PHYS_OFFSET, pages << PAGE_SHIFT, BOOT_MEM_RAM); + memblock_add(PHYS_OFFSET, pages << PAGE_SHIFT); } void __init prom_free_prom_memory(void) diff --git a/arch/mips/ath25/ar2315.c b/arch/mips/ath25/ar2315.c index e7b53e3960c8..9dbed7b5ea76 100644 --- a/arch/mips/ath25/ar2315.c +++ b/arch/mips/ath25/ar2315.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/irqdomain.h> #include <linux/interrupt.h> +#include <linux/memblock.h> #include <linux/platform_device.h> #include <linux/reboot.h> #include <asm/bootinfo.h> @@ -266,7 +267,7 @@ void __init ar2315_plat_mem_setup(void) memsize <<= 1 + ATH25_REG_MS(memcfg, AR2315_MEM_CFG_COL_WIDTH); memsize <<= 1 + ATH25_REG_MS(memcfg, AR2315_MEM_CFG_ROW_WIDTH); memsize <<= 3; - add_memory_region(0, memsize, BOOT_MEM_RAM); + memblock_add(0, memsize); iounmap(sdram_base); ar2315_rst_base = ioremap(AR2315_RST_BASE, AR2315_RST_SIZE); diff --git a/arch/mips/ath25/ar5312.c b/arch/mips/ath25/ar5312.c index 42bf2afb4765..23c879f4b734 100644 --- a/arch/mips/ath25/ar5312.c +++ b/arch/mips/ath25/ar5312.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/irqdomain.h> #include <linux/interrupt.h> +#include <linux/memblock.h> #include <linux/platform_device.h> #include <linux/mtd/physmap.h> #include <linux/reboot.h> @@ -363,7 +364,7 @@ void __init ar5312_plat_mem_setup(void) memsize = (bank0_ac ? (1 << (bank0_ac + 1)) : 0) + (bank1_ac ? (1 << (bank1_ac + 1)) : 0); memsize <<= 20; - add_memory_region(0, memsize, BOOT_MEM_RAM); + memblock_add(0, memsize); iounmap(sdram_base); ar5312_rst_base = ioremap(AR5312_RST_BASE, AR5312_RST_SIZE); diff --git a/arch/mips/bcm47xx/prom.c b/arch/mips/bcm47xx/prom.c index 135a5407f015..3e2a8166377f 100644 --- a/arch/mips/bcm47xx/prom.c +++ b/arch/mips/bcm47xx/prom.c @@ -27,6 +27,7 @@ #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/memblock.h> #include <linux/spinlock.h> #include <linux/ssb/ssb_driver_chipcommon.h> #include <linux/ssb/ssb_regs.h> @@ -97,7 +98,7 @@ static __init void prom_init_mem(void) */ if (c->cputype == CPU_74K && (mem == (128 << 20))) mem -= 0x1000; - add_memory_region(0, mem, BOOT_MEM_RAM); + memblock_add(0, mem); } /* diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 01427bde2397..94bf839576c1 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -141,7 +141,7 @@ static void __init bcm47xx_register_bcma(void) /* * Memory setup is done in the early part of MIPS's arch_mem_init. It's supposed - * to detect memory and record it with add_memory_region. + * to detect memory and record it with memblock_add. * Any extra initializaion performed here must not use kmalloc or bootmem. */ void __init plat_mem_setup(void) diff --git a/arch/mips/bcm63xx/boards/board_bcm963xx.c b/arch/mips/bcm63xx/boards/board_bcm963xx.c index 230bf27c1fb8..01aff80a5967 100644 --- a/arch/mips/bcm63xx/boards/board_bcm963xx.c +++ b/arch/mips/bcm63xx/boards/board_bcm963xx.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * * Copyright (C) 2008 Maxime Bizon <mbizon@freebox.fr> * Copyright (C) 2008 Florian Fainelli <florian@openwrt.org> */ @@ -32,7 +29,6 @@ #include <uapi/linux/bcm933xx_hcs.h> - #define HCS_OFFSET_128K 0x20000 static struct board_info board; @@ -42,30 +38,28 @@ static struct board_info board; */ #ifdef CONFIG_BCM63XX_CPU_3368 static struct board_info __initdata board_cvg834g = { - .name = "CVG834G_E15R3921", - .expected_cpu_id = 0x3368, - - .has_uart0 = 1, - .has_uart1 = 1, + .name = "CVG834G_E15R3921", + .expected_cpu_id = 0x3368, - .has_enet0 = 1, - .has_pci = 1, + .ephy_reset_gpio = 36, + .ephy_reset_gpio_flags = GPIOF_INIT_HIGH, + .has_pci = 1, + .has_uart0 = 1, + .has_uart1 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, .leds = { { - .name = "CVG834G:green:power", - .gpio = 37, + .name = "CVG834G:green:power", + .gpio = 37, .default_trigger= "default-on", }, }, - - .ephy_reset_gpio = 36, - .ephy_reset_gpio_flags = GPIOF_INIT_HIGH, }; #endif /* CONFIG_BCM63XX_CPU_3368 */ @@ -74,44 +68,44 @@ static struct board_info __initdata board_cvg834g = { */ #ifdef CONFIG_BCM63XX_CPU_6328 static struct board_info __initdata board_96328avng = { - .name = "96328avng", - .expected_cpu_id = 0x6328, + .name = "96328avng", + .expected_cpu_id = 0x6328, - .has_uart0 = 1, - .has_pci = 1, - .has_usbd = 0, + .has_pci = 1, + .has_uart0 = 1, + .has_usbd = 0, .usbd = { - .use_fullspeed = 0, - .port_no = 0, + .use_fullspeed = 0, + .port_no = 0, }, .leds = { { - .name = "96328avng::ppp-fail", - .gpio = 2, - .active_low = 1, + .name = "96328avng::ppp-fail", + .gpio = 2, + .active_low = 1, }, { - .name = "96328avng::power", - .gpio = 4, - .active_low = 1, + .name = "96328avng::power", + .gpio = 4, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "96328avng::power-fail", - .gpio = 8, - .active_low = 1, + .name = "96328avng::power-fail", + .gpio = 8, + .active_low = 1, }, { - .name = "96328avng::wps", - .gpio = 9, - .active_low = 1, + .name = "96328avng::wps", + .gpio = 9, + .active_low = 1, }, { - .name = "96328avng::ppp", - .gpio = 11, - .active_low = 1, + .name = "96328avng::ppp", + .gpio = 11, + .active_low = 1, }, }, }; @@ -122,85 +116,86 @@ static struct board_info __initdata board_96328avng = { */ #ifdef CONFIG_BCM63XX_CPU_6338 static struct board_info __initdata board_96338gw = { - .name = "96338GW", - .expected_cpu_id = 0x6338, + .name = "96338GW", + .expected_cpu_id = 0x6338, - .has_uart0 = 1, - .has_enet0 = 1, + .has_ohci0 = 1, + .has_uart0 = 1, + + .has_enet0 = 1, .enet0 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - .has_ohci0 = 1, - .leds = { { - .name = "adsl", - .gpio = 3, - .active_low = 1, + .name = "adsl", + .gpio = 3, + .active_low = 1, }, { - .name = "ses", - .gpio = 5, - .active_low = 1, + .name = "ses", + .gpio = 5, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 4, - .active_low = 1, + .name = "ppp-fail", + .gpio = 4, + .active_low = 1, }, { - .name = "power", - .gpio = 0, - .active_low = 1, + .name = "power", + .gpio = 0, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 1, - .active_low = 1, + .name = "stop", + .gpio = 1, + .active_low = 1, } }, }; static struct board_info __initdata board_96338w = { - .name = "96338W", - .expected_cpu_id = 0x6338, + .name = "96338W", + .expected_cpu_id = 0x6338, + + .has_uart0 = 1, - .has_uart0 = 1, - .has_enet0 = 1, + .has_enet0 = 1, .enet0 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, .leds = { { - .name = "adsl", - .gpio = 3, - .active_low = 1, + .name = "adsl", + .gpio = 3, + .active_low = 1, }, { - .name = "ses", - .gpio = 5, - .active_low = 1, + .name = "ses", + .gpio = 5, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 4, - .active_low = 1, + .name = "ppp-fail", + .gpio = 4, + .active_low = 1, }, { - .name = "power", - .gpio = 0, - .active_low = 1, + .name = "power", + .gpio = 0, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 1, - .active_low = 1, + .name = "stop", + .gpio = 1, + .active_low = 1, }, }, }; @@ -211,10 +206,10 @@ static struct board_info __initdata board_96338w = { */ #ifdef CONFIG_BCM63XX_CPU_6345 static struct board_info __initdata board_96345gw2 = { - .name = "96345GW2", - .expected_cpu_id = 0x6345, + .name = "96345GW2", + .expected_cpu_id = 0x6345, - .has_uart0 = 1, + .has_uart0 = 1, }; #endif /* CONFIG_BCM63XX_CPU_6345 */ @@ -223,286 +218,282 @@ static struct board_info __initdata board_96345gw2 = { */ #ifdef CONFIG_BCM63XX_CPU_6348 static struct board_info __initdata board_96348r = { - .name = "96348R", - .expected_cpu_id = 0x6348, + .name = "96348R", + .expected_cpu_id = 0x6348, - .has_uart0 = 1, - .has_enet0 = 1, - .has_pci = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, .leds = { { - .name = "adsl-fail", - .gpio = 2, - .active_low = 1, + .name = "adsl-fail", + .gpio = 2, + .active_low = 1, }, { - .name = "ppp", - .gpio = 3, - .active_low = 1, + .name = "ppp", + .gpio = 3, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 4, - .active_low = 1, + .name = "ppp-fail", + .gpio = 4, + .active_low = 1, }, { - .name = "power", - .gpio = 0, - .active_low = 1, + .name = "power", + .gpio = 0, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 1, - .active_low = 1, + .name = "stop", + .gpio = 1, + .active_low = 1, }, }, }; static struct board_info __initdata board_96348gw_10 = { - .name = "96348GW-10", - .expected_cpu_id = 0x6348, + .name = "96348GW-10", + .expected_cpu_id = 0x6348, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ohci0 = 1, + .has_pccard = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - .has_ohci0 = 1, - .has_pccard = 1, - .has_ehci0 = 1, - .leds = { { - .name = "adsl-fail", - .gpio = 2, - .active_low = 1, + .name = "adsl-fail", + .gpio = 2, + .active_low = 1, }, { - .name = "ppp", - .gpio = 3, - .active_low = 1, + .name = "ppp", + .gpio = 3, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 4, - .active_low = 1, + .name = "ppp-fail", + .gpio = 4, + .active_low = 1, }, { - .name = "power", - .gpio = 0, - .active_low = 1, + .name = "power", + .gpio = 0, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 1, - .active_low = 1, + .name = "stop", + .gpio = 1, + .active_low = 1, }, }, }; static struct board_info __initdata board_96348gw_11 = { - .name = "96348GW-11", - .expected_cpu_id = 0x6348, + .name = "96348GW-11", + .expected_cpu_id = 0x6348, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ohci0 = 1, + .has_pccard = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - - .has_ohci0 = 1, - .has_pccard = 1, - .has_ehci0 = 1, - .leds = { { - .name = "adsl-fail", - .gpio = 2, - .active_low = 1, + .name = "adsl-fail", + .gpio = 2, + .active_low = 1, }, { - .name = "ppp", - .gpio = 3, - .active_low = 1, + .name = "ppp", + .gpio = 3, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 4, - .active_low = 1, + .name = "ppp-fail", + .gpio = 4, + .active_low = 1, }, { - .name = "power", - .gpio = 0, - .active_low = 1, + .name = "power", + .gpio = 0, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 1, - .active_low = 1, + .name = "stop", + .gpio = 1, + .active_low = 1, }, }, }; static struct board_info __initdata board_96348gw = { - .name = "96348GW", - .expected_cpu_id = 0x6348, + .name = "96348GW", + .expected_cpu_id = 0x6348, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ohci0 = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - .has_ohci0 = 1, - .leds = { { - .name = "adsl-fail", - .gpio = 2, - .active_low = 1, + .name = "adsl-fail", + .gpio = 2, + .active_low = 1, }, { - .name = "ppp", - .gpio = 3, - .active_low = 1, + .name = "ppp", + .gpio = 3, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 4, - .active_low = 1, + .name = "ppp-fail", + .gpio = 4, + .active_low = 1, }, { - .name = "power", - .gpio = 0, - .active_low = 1, + .name = "power", + .gpio = 0, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 1, - .active_low = 1, + .name = "stop", + .gpio = 1, + .active_low = 1, }, }, }; static struct board_info __initdata board_FAST2404 = { - .name = "F@ST2404", - .expected_cpu_id = 0x6348, + .name = "F@ST2404", + .expected_cpu_id = 0x6348, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ohci0 = 1, + .has_pccard = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - - .has_ohci0 = 1, - .has_pccard = 1, - .has_ehci0 = 1, }; static struct board_info __initdata board_rta1025w_16 = { - .name = "RTA1025W_16", - .expected_cpu_id = 0x6348, + .name = "RTA1025W_16", + .expected_cpu_id = 0x6348, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_pci = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, }; static struct board_info __initdata board_DV201AMR = { - .name = "DV201AMR", - .expected_cpu_id = 0x6348, + .name = "DV201AMR", + .expected_cpu_id = 0x6348, - .has_uart0 = 1, - .has_pci = 1, - .has_ohci0 = 1, + .has_ohci0 = 1, + .has_pci = 1, + .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, }; static struct board_info __initdata board_96348gw_a = { - .name = "96348GW-A", - .expected_cpu_id = 0x6348, + .name = "96348GW-A", + .expected_cpu_id = 0x6348, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ohci0 = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - - .has_ohci0 = 1, }; #endif /* CONFIG_BCM63XX_CPU_6348 */ @@ -511,146 +502,142 @@ static struct board_info __initdata board_96348gw_a = { */ #ifdef CONFIG_BCM63XX_CPU_6358 static struct board_info __initdata board_96358vw = { - .name = "96358VW", - .expected_cpu_id = 0x6358, + .name = "96358VW", + .expected_cpu_id = 0x6358, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ehci0 = 1, + .has_ohci0 = 1, + .has_pccard = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - .has_ohci0 = 1, - .has_pccard = 1, - .has_ehci0 = 1, - .leds = { { - .name = "adsl-fail", - .gpio = 15, - .active_low = 1, + .name = "adsl-fail", + .gpio = 15, + .active_low = 1, }, { - .name = "ppp", - .gpio = 22, - .active_low = 1, + .name = "ppp", + .gpio = 22, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 23, - .active_low = 1, + .name = "ppp-fail", + .gpio = 23, + .active_low = 1, }, { - .name = "power", - .gpio = 4, + .name = "power", + .gpio = 4, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 5, + .name = "stop", + .gpio = 5, }, }, }; static struct board_info __initdata board_96358vw2 = { - .name = "96358VW2", - .expected_cpu_id = 0x6358, + .name = "96358VW2", + .expected_cpu_id = 0x6358, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ehci0 = 1, + .has_ohci0 = 1, + .has_pccard = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - - .has_ohci0 = 1, - .has_pccard = 1, - .has_ehci0 = 1, - .leds = { { - .name = "adsl", - .gpio = 22, - .active_low = 1, + .name = "adsl", + .gpio = 22, + .active_low = 1, }, { - .name = "ppp-fail", - .gpio = 23, + .name = "ppp-fail", + .gpio = 23, }, { - .name = "power", - .gpio = 5, - .active_low = 1, + .name = "power", + .gpio = 5, + .active_low = 1, .default_trigger = "default-on", }, { - .name = "stop", - .gpio = 4, - .active_low = 1, + .name = "stop", + .gpio = 4, + .active_low = 1, }, }, }; static struct board_info __initdata board_AGPFS0 = { - .name = "AGPF-S0", - .expected_cpu_id = 0x6358, + .name = "AGPF-S0", + .expected_cpu_id = 0x6358, - .has_uart0 = 1, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ehci0 = 1, + .has_ohci0 = 1, + .has_pci = 1, + .has_uart0 = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - - .has_ohci0 = 1, - .has_ehci0 = 1, }; static struct board_info __initdata board_DWVS0 = { - .name = "DWV-S0", - .expected_cpu_id = 0x6358, + .name = "DWV-S0", + .expected_cpu_id = 0x6358, - .has_enet0 = 1, - .has_enet1 = 1, - .has_pci = 1, + .has_ehci0 = 1, + .has_ohci0 = 1, + .has_pci = 1, + .has_enet0 = 1, .enet0 = { - .has_phy = 1, - .use_internal_phy = 1, + .has_phy = 1, + .use_internal_phy = 1, }, + .has_enet1 = 1, .enet1 = { - .force_speed_100 = 1, - .force_duplex_full = 1, + .force_speed_100 = 1, + .force_duplex_full = 1, }, - - .has_ohci0 = 1, }; #endif /* CONFIG_BCM63XX_CPU_6358 */ diff --git a/arch/mips/bcm63xx/setup.c b/arch/mips/bcm63xx/setup.c index e28ee9a7cc7e..d811e3e03f81 100644 --- a/arch/mips/bcm63xx/setup.c +++ b/arch/mips/bcm63xx/setup.c @@ -146,7 +146,7 @@ void __init plat_time_init(void) void __init plat_mem_setup(void) { - add_memory_region(0, bcm63xx_get_memory_size(), BOOT_MEM_RAM); + memblock_add(0, bcm63xx_get_memory_size()); _machine_halt = bcm63xx_machine_halt; _machine_restart = __bcm63xx_machine_reboot; diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c index df56bf4179e3..49061b870680 100644 --- a/arch/mips/bmips/dma.c +++ b/arch/mips/bmips/dma.c @@ -40,7 +40,7 @@ static struct bmips_dma_range *bmips_dma_ranges; #define FLUSH_RAC 0x100 -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t pa) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t pa) { struct bmips_dma_range *r; @@ -52,7 +52,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t pa) return pa; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { struct bmips_dma_range *r; diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 6e56caef69f0..d66511825fe1 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -22,7 +22,12 @@ KBUILD_CFLAGS := $(filter-out -pg, $(KBUILD_CFLAGS)) KBUILD_CFLAGS := $(filter-out -fstack-protector, $(KBUILD_CFLAGS)) -KBUILD_CFLAGS := $(KBUILD_CFLAGS) -D__KERNEL__ \ +# Disable lq/sq in zboot +ifdef CONFIG_CPU_LOONGSON64 +KBUILD_CFLAGS := $(filter-out -march=loongson3a, $(KBUILD_CFLAGS)) -march=mips64r2 +endif + +KBUILD_CFLAGS := $(KBUILD_CFLAGS) -D__KERNEL__ -D__DISABLE_EXPORTS \ -DBOOT_HEAP_SIZE=$(BOOT_HEAP_SIZE) -D"VMLINUX_LOAD_ADDRESS_ULL=$(VMLINUX_LOAD_ADDRESS)ull" KBUILD_AFLAGS := $(KBUILD_AFLAGS) -D__ASSEMBLY__ \ @@ -70,6 +75,7 @@ tool_$(CONFIG_KERNEL_LZ4) = lz4 tool_$(CONFIG_KERNEL_LZMA) = lzma tool_$(CONFIG_KERNEL_LZO) = lzo tool_$(CONFIG_KERNEL_XZ) = xzkern +tool_$(CONFIG_KERNEL_ZSTD) = zstd22 targets += vmlinux.bin.z $(obj)/vmlinux.bin.z: $(obj)/vmlinux.bin FORCE diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c index 88f5d637b1c4..c61c641674e6 100644 --- a/arch/mips/boot/compressed/decompress.c +++ b/arch/mips/boot/compressed/decompress.c @@ -72,6 +72,10 @@ void error(char *x) #include "../../../../lib/decompress_unxz.c" #endif +#ifdef CONFIG_KERNEL_ZSTD +#include "../../../../lib/decompress_unzstd.c" +#endif + const unsigned long __stack_chk_guard = 0x000a0dff; void __stack_chk_fail(void) diff --git a/arch/mips/boot/compressed/string.c b/arch/mips/boot/compressed/string.c index 43beecc3587c..0b593b709228 100644 --- a/arch/mips/boot/compressed/string.c +++ b/arch/mips/boot/compressed/string.c @@ -5,6 +5,7 @@ * Very small subset of simple string routines */ +#include <linux/compiler_attributes.h> #include <linux/types.h> void *memcpy(void *dest, const void *src, size_t n) @@ -27,3 +28,19 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } + +void * __weak memmove(void *dest, const void *src, size_t n) +{ + unsigned int i; + const char *s = src; + char *d = dest; + + if ((uintptr_t)dest < (uintptr_t)src) { + for (i = 0; i < n; i++) + d[i] = s[i]; + } else { + for (i = n; i > 0; i--) + d[i - 1] = s[i - 1]; + } + return dest; +} diff --git a/arch/mips/boot/dts/ingenic/jz4725b.dtsi b/arch/mips/boot/dts/ingenic/jz4725b.dtsi index a8fca560878d..a1f0b71c9223 100644 --- a/arch/mips/boot/dts/ingenic/jz4725b.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4725b.dtsi @@ -7,6 +7,20 @@ #size-cells = <1>; compatible = "ingenic,jz4725b"; + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "ingenic,xburst-mxu1.0"; + reg = <0>; + + clocks = <&cgu JZ4725B_CLK_CCLK>; + clock-names = "cpu"; + }; + }; + cpuintc: interrupt-controller { #address-cells = <0>; #interrupt-cells = <1>; diff --git a/arch/mips/boot/dts/ingenic/jz4740.dtsi b/arch/mips/boot/dts/ingenic/jz4740.dtsi index 1520585c235c..eee523678ce5 100644 --- a/arch/mips/boot/dts/ingenic/jz4740.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4740.dtsi @@ -7,6 +7,20 @@ #size-cells = <1>; compatible = "ingenic,jz4740"; + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "ingenic,xburst-mxu1.0"; + reg = <0>; + + clocks = <&cgu JZ4740_CLK_CCLK>; + clock-names = "cpu"; + }; + }; + cpuintc: interrupt-controller { #address-cells = <0>; #interrupt-cells = <1>; diff --git a/arch/mips/boot/dts/ingenic/jz4770.dtsi b/arch/mips/boot/dts/ingenic/jz4770.dtsi index fa11ac950499..018721a9eea9 100644 --- a/arch/mips/boot/dts/ingenic/jz4770.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4770.dtsi @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 - #include <dt-bindings/clock/jz4770-cgu.h> #include <dt-bindings/clock/ingenic,tcu.h> @@ -8,6 +7,20 @@ #size-cells = <1>; compatible = "ingenic,jz4770"; + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "ingenic,xburst-fpu1.0-mxu1.1"; + reg = <0>; + + clocks = <&cgu JZ4770_CLK_CCLK>; + clock-names = "cpu"; + }; + }; + cpuintc: interrupt-controller { #address-cells = <0>; #interrupt-cells = <1>; diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi index b7f409a7cf5d..dfb5a7e1bb21 100644 --- a/arch/mips/boot/dts/ingenic/jz4780.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi @@ -8,6 +8,29 @@ #size-cells = <1>; compatible = "ingenic,jz4780"; + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "ingenic,xburst-fpu1.0-mxu1.1"; + reg = <0>; + + clocks = <&cgu JZ4780_CLK_CPU>; + clock-names = "cpu"; + }; + + cpu1: cpu@1 { + device_type = "cpu"; + compatible = "ingenic,xburst-fpu1.0-mxu1.1"; + reg = <1>; + + clocks = <&cgu JZ4780_CLK_CORE1>; + clock-names = "cpu"; + }; + }; + cpuintc: interrupt-controller { #address-cells = <0>; #interrupt-cells = <1>; diff --git a/arch/mips/boot/dts/ingenic/qi_lb60.dts b/arch/mips/boot/dts/ingenic/qi_lb60.dts index bf298268f1a1..ba0218971572 100644 --- a/arch/mips/boot/dts/ingenic/qi_lb60.dts +++ b/arch/mips/boot/dts/ingenic/qi_lb60.dts @@ -109,74 +109,73 @@ debounce-delay-ms = <10>; wakeup-source; - row-gpios = <&gpd 18 0 &gpd 19 0 &gpd 20 0 &gpd 21 0 - &gpd 22 0 &gpd 23 0 &gpd 24 0 &gpd 26 0>; - col-gpios = <&gpc 10 0 &gpc 11 0 &gpc 12 0 &gpc 13 0 - &gpc 14 0 &gpc 15 0 &gpc 16 0 &gpc 17 0>; + row-gpios = <&gpd 18 0>, <&gpd 19 0>, <&gpd 20 0>, <&gpd 21 0>, + <&gpd 22 0>, <&gpd 23 0>, <&gpd 24 0>, <&gpd 26 0>; + col-gpios = <&gpc 10 0>, <&gpc 11 0>, <&gpc 12 0>, <&gpc 13 0>, + <&gpc 14 0>, <&gpc 15 0>, <&gpc 16 0>, <&gpc 17 0>; gpio-activelow; - linux,keymap = < - MATRIX_KEY(0, 0, KEY_F1) /* S2 */ - MATRIX_KEY(0, 1, KEY_F2) /* S3 */ - MATRIX_KEY(0, 2, KEY_F3) /* S4 */ - MATRIX_KEY(0, 3, KEY_F4) /* S5 */ - MATRIX_KEY(0, 4, KEY_F5) /* S6 */ - MATRIX_KEY(0, 5, KEY_F6) /* S7 */ - MATRIX_KEY(0, 6, KEY_F7) /* S8 */ - - MATRIX_KEY(1, 0, KEY_Q) /* S10 */ - MATRIX_KEY(1, 1, KEY_W) /* S11 */ - MATRIX_KEY(1, 2, KEY_E) /* S12 */ - MATRIX_KEY(1, 3, KEY_R) /* S13 */ - MATRIX_KEY(1, 4, KEY_T) /* S14 */ - MATRIX_KEY(1, 5, KEY_Y) /* S15 */ - MATRIX_KEY(1, 6, KEY_U) /* S16 */ - MATRIX_KEY(1, 7, KEY_I) /* S17 */ - MATRIX_KEY(2, 0, KEY_A) /* S18 */ - MATRIX_KEY(2, 1, KEY_S) /* S19 */ - MATRIX_KEY(2, 2, KEY_D) /* S20 */ - MATRIX_KEY(2, 3, KEY_F) /* S21 */ - MATRIX_KEY(2, 4, KEY_G) /* S22 */ - MATRIX_KEY(2, 5, KEY_H) /* S23 */ - MATRIX_KEY(2, 6, KEY_J) /* S24 */ - MATRIX_KEY(2, 7, KEY_K) /* S25 */ - MATRIX_KEY(3, 0, KEY_ESC) /* S26 */ - MATRIX_KEY(3, 1, KEY_Z) /* S27 */ - MATRIX_KEY(3, 2, KEY_X) /* S28 */ - MATRIX_KEY(3, 3, KEY_C) /* S29 */ - MATRIX_KEY(3, 4, KEY_V) /* S30 */ - MATRIX_KEY(3, 5, KEY_B) /* S31 */ - MATRIX_KEY(3, 6, KEY_N) /* S32 */ - MATRIX_KEY(3, 7, KEY_M) /* S33 */ - MATRIX_KEY(4, 0, KEY_TAB) /* S34 */ - MATRIX_KEY(4, 1, KEY_CAPSLOCK) /* S35 */ - MATRIX_KEY(4, 2, KEY_BACKSLASH) /* S36 */ - MATRIX_KEY(4, 3, KEY_APOSTROPHE) /* S37 */ - MATRIX_KEY(4, 4, KEY_COMMA) /* S38 */ - MATRIX_KEY(4, 5, KEY_DOT) /* S39 */ - MATRIX_KEY(4, 6, KEY_SLASH) /* S40 */ - MATRIX_KEY(4, 7, KEY_UP) /* S41 */ - MATRIX_KEY(5, 0, KEY_O) /* S42 */ - MATRIX_KEY(5, 1, KEY_L) /* S43 */ - MATRIX_KEY(5, 2, KEY_EQUAL) /* S44 */ - MATRIX_KEY(5, 3, KEY_QI_UPRED) /* S45 */ - MATRIX_KEY(5, 4, KEY_SPACE) /* S46 */ - MATRIX_KEY(5, 5, KEY_QI_QI) /* S47 */ - MATRIX_KEY(5, 6, KEY_RIGHTCTRL) /* S48 */ - MATRIX_KEY(5, 7, KEY_LEFT) /* S49 */ - MATRIX_KEY(6, 0, KEY_F8) /* S50 */ - MATRIX_KEY(6, 1, KEY_P) /* S51 */ - MATRIX_KEY(6, 2, KEY_BACKSPACE)/* S52 */ - MATRIX_KEY(6, 3, KEY_ENTER) /* S53 */ - MATRIX_KEY(6, 4, KEY_QI_VOLUP) /* S54 */ - MATRIX_KEY(6, 5, KEY_QI_VOLDOWN) /* S55 */ - MATRIX_KEY(6, 6, KEY_DOWN) /* S56 */ - MATRIX_KEY(6, 7, KEY_RIGHT) /* S57 */ - - MATRIX_KEY(7, 0, KEY_LEFTSHIFT) /* S58 */ - MATRIX_KEY(7, 1, KEY_LEFTALT) /* S59 */ - MATRIX_KEY(7, 2, KEY_QI_FN) /* S60 */ - >; + linux,keymap = + <MATRIX_KEY(0, 0, KEY_F1)>, /* S2 */ + <MATRIX_KEY(0, 1, KEY_F2)>, /* S3 */ + <MATRIX_KEY(0, 2, KEY_F3)>, /* S4 */ + <MATRIX_KEY(0, 3, KEY_F4)>, /* S5 */ + <MATRIX_KEY(0, 4, KEY_F5)>, /* S6 */ + <MATRIX_KEY(0, 5, KEY_F6)>, /* S7 */ + <MATRIX_KEY(0, 6, KEY_F7)>, /* S8 */ + + <MATRIX_KEY(1, 0, KEY_Q)>, /* S10 */ + <MATRIX_KEY(1, 1, KEY_W)>, /* S11 */ + <MATRIX_KEY(1, 2, KEY_E)>, /* S12 */ + <MATRIX_KEY(1, 3, KEY_R)>, /* S13 */ + <MATRIX_KEY(1, 4, KEY_T)>, /* S14 */ + <MATRIX_KEY(1, 5, KEY_Y)>, /* S15 */ + <MATRIX_KEY(1, 6, KEY_U)>, /* S16 */ + <MATRIX_KEY(1, 7, KEY_I)>, /* S17 */ + <MATRIX_KEY(2, 0, KEY_A)>, /* S18 */ + <MATRIX_KEY(2, 1, KEY_S)>, /* S19 */ + <MATRIX_KEY(2, 2, KEY_D)>, /* S20 */ + <MATRIX_KEY(2, 3, KEY_F)>, /* S21 */ + <MATRIX_KEY(2, 4, KEY_G)>, /* S22 */ + <MATRIX_KEY(2, 5, KEY_H)>, /* S23 */ + <MATRIX_KEY(2, 6, KEY_J)>, /* S24 */ + <MATRIX_KEY(2, 7, KEY_K)>, /* S25 */ + <MATRIX_KEY(3, 0, KEY_ESC)>, /* S26 */ + <MATRIX_KEY(3, 1, KEY_Z)>, /* S27 */ + <MATRIX_KEY(3, 2, KEY_X)>, /* S28 */ + <MATRIX_KEY(3, 3, KEY_C)>, /* S29 */ + <MATRIX_KEY(3, 4, KEY_V)>, /* S30 */ + <MATRIX_KEY(3, 5, KEY_B)>, /* S31 */ + <MATRIX_KEY(3, 6, KEY_N)>, /* S32 */ + <MATRIX_KEY(3, 7, KEY_M)>, /* S33 */ + <MATRIX_KEY(4, 0, KEY_TAB)>, /* S34 */ + <MATRIX_KEY(4, 1, KEY_CAPSLOCK)>, /* S35 */ + <MATRIX_KEY(4, 2, KEY_BACKSLASH)>, /* S36 */ + <MATRIX_KEY(4, 3, KEY_APOSTROPHE)>, /* S37 */ + <MATRIX_KEY(4, 4, KEY_COMMA)>, /* S38 */ + <MATRIX_KEY(4, 5, KEY_DOT)>, /* S39 */ + <MATRIX_KEY(4, 6, KEY_SLASH)>, /* S40 */ + <MATRIX_KEY(4, 7, KEY_UP)>, /* S41 */ + <MATRIX_KEY(5, 0, KEY_O)>, /* S42 */ + <MATRIX_KEY(5, 1, KEY_L)>, /* S43 */ + <MATRIX_KEY(5, 2, KEY_EQUAL)>, /* S44 */ + <MATRIX_KEY(5, 3, KEY_QI_UPRED)>, /* S45 */ + <MATRIX_KEY(5, 4, KEY_SPACE)>, /* S46 */ + <MATRIX_KEY(5, 5, KEY_QI_QI)>, /* S47 */ + <MATRIX_KEY(5, 6, KEY_RIGHTCTRL)>, /* S48 */ + <MATRIX_KEY(5, 7, KEY_LEFT)>, /* S49 */ + <MATRIX_KEY(6, 0, KEY_F8)>, /* S50 */ + <MATRIX_KEY(6, 1, KEY_P)>, /* S51 */ + <MATRIX_KEY(6, 2, KEY_BACKSPACE)>,/* S52 */ + <MATRIX_KEY(6, 3, KEY_ENTER)>, /* S53 */ + <MATRIX_KEY(6, 4, KEY_QI_VOLUP)>, /* S54 */ + <MATRIX_KEY(6, 5, KEY_QI_VOLDOWN)>, /* S55 */ + <MATRIX_KEY(6, 6, KEY_DOWN)>, /* S56 */ + <MATRIX_KEY(6, 7, KEY_RIGHT)>, /* S57 */ + + <MATRIX_KEY(7, 0, KEY_LEFTSHIFT)>, /* S58 */ + <MATRIX_KEY(7, 1, KEY_LEFTALT)>, /* S59 */ + <MATRIX_KEY(7, 2, KEY_QI_FN)>; /* S60 */ }; spi { @@ -261,12 +260,12 @@ #address-cells = <1>; #size-cells = <0>; - ingenic,bch-controller = <&ecc>; + ecc-engine = <&ecc>; pinctrl-names = "default"; pinctrl-0 = <&pins_nemc>; - rb-gpios = <&gpc 30 GPIO_ACTIVE_LOW>; + rb-gpios = <&gpc 30 GPIO_ACTIVE_HIGH>; nand@1 { reg = <1>; @@ -324,7 +323,7 @@ pins_nemc: nemc { function = "nand"; - groups = "nand-cs1"; + groups = "nand-fre-fwe", "nand-cs1"; }; pins_uart0: uart0 { diff --git a/arch/mips/boot/dts/ingenic/x1000.dtsi b/arch/mips/boot/dts/ingenic/x1000.dtsi index 9de9e7c2d523..1f1f896dd1f7 100644 --- a/arch/mips/boot/dts/ingenic/x1000.dtsi +++ b/arch/mips/boot/dts/ingenic/x1000.dtsi @@ -8,6 +8,20 @@ #size-cells = <1>; compatible = "ingenic,x1000", "ingenic,x1000e"; + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "ingenic,xburst-fpu1.0-mxu1.1"; + reg = <0>; + + clocks = <&cgu X1000_CLK_CPU>; + clock-names = "cpu"; + }; + }; + cpuintc: interrupt-controller { #address-cells = <0>; #interrupt-cells = <1>; diff --git a/arch/mips/boot/dts/ingenic/x1830.dtsi b/arch/mips/boot/dts/ingenic/x1830.dtsi index eb1214481a33..b05dac3ae308 100644 --- a/arch/mips/boot/dts/ingenic/x1830.dtsi +++ b/arch/mips/boot/dts/ingenic/x1830.dtsi @@ -8,6 +8,20 @@ #size-cells = <1>; compatible = "ingenic,x1830"; + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "ingenic,xburst-fpu2.0-mxu2.0"; + reg = <0>; + + clocks = <&cgu X1830_CLK_CPU>; + clock-names = "cpu"; + }; + }; + cpuintc: interrupt-controller { #address-cells = <0>; #interrupt-cells = <1>; diff --git a/arch/mips/boot/dts/loongson/ls7a-pch.dtsi b/arch/mips/boot/dts/loongson/ls7a-pch.dtsi index e574a062dfae..f99a7a11fded 100644 --- a/arch/mips/boot/dts/loongson/ls7a-pch.dtsi +++ b/arch/mips/boot/dts/loongson/ls7a-pch.dtsi @@ -19,6 +19,45 @@ #interrupt-cells = <2>; }; + ls7a_uart0: serial@10080000 { + compatible = "ns16550a"; + reg = <0 0x10080000 0 0x100>; + clock-frequency = <50000000>; + interrupt-parent = <&pic>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + no-loopback-test; + }; + + ls7a_uart1: serial@10080100 { + status = "disabled"; + compatible = "ns16550a"; + reg = <0 0x10080100 0 0x100>; + clock-frequency = <50000000>; + interrupt-parent = <&pic>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + no-loopback-test; + }; + + ls7a_uart2: serial@10080200 { + status = "disabled"; + compatible = "ns16550a"; + reg = <0 0x10080200 0 0x100>; + clock-frequency = <50000000>; + interrupt-parent = <&pic>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + no-loopback-test; + }; + + ls7a_uart3: serial@10080300 { + status = "disabled"; + compatible = "ns16550a"; + reg = <0 0x10080300 0 0x100>; + clock-frequency = <50000000>; + interrupt-parent = <&pic>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + no-loopback-test; + }; + pci@1a000000 { compatible = "loongson,ls7a-pci"; device_type = "pci"; diff --git a/arch/mips/boot/dts/mscc/ocelot.dtsi b/arch/mips/boot/dts/mscc/ocelot.dtsi index f94e8a02ed06..535a98284dcb 100644 --- a/arch/mips/boot/dts/mscc/ocelot.dtsi +++ b/arch/mips/boot/dts/mscc/ocelot.dtsi @@ -134,11 +134,13 @@ <0x1280000 0x100>, <0x1800000 0x80000>, <0x1880000 0x10000>, + <0x1040000 0x10000>, + <0x1050000 0x10000>, <0x1060000 0x10000>; reg-names = "sys", "rew", "qs", "ptp", "port0", "port1", "port2", "port3", "port4", "port5", "port6", "port7", "port8", "port9", "port10", "qsys", - "ana", "s2"; + "ana", "s0", "s1", "s2"; interrupts = <18 21 22>; interrupt-names = "ptp_rdy", "xtr", "inj"; diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index ad1aecc4b401..df70308db0e6 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -168,7 +168,7 @@ void __init octeon_pci_dma_init(void) } #endif /* CONFIG_PCI */ -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { #ifdef CONFIG_PCI if (dev && dev_is_pci(dev)) @@ -177,7 +177,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return paddr; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { #ifdef CONFIG_PCI if (dev && dev_is_pci(dev)) diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index 4f34d92b52f9..561389d3fadb 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -16,6 +16,7 @@ #include <linux/export.h> #include <linux/interrupt.h> #include <linux/io.h> +#include <linux/memblock.h> #include <linux/serial.h> #include <linux/smp.h> #include <linux/types.h> @@ -930,7 +931,7 @@ static __init void memory_exclude_page(u64 addr, u64 *mem, u64 *size) { if (addr > *mem && addr < *mem + *size) { u64 inc = addr - *mem; - add_memory_region(*mem, inc, BOOT_MEM_RAM); + memblock_add(*mem, inc); *mem += inc; *size -= inc; } @@ -992,19 +993,18 @@ void __init plat_mem_setup(void) /* Crashkernel ignores bootmem list. It relies on mem=X@Y option */ #ifdef CONFIG_CRASH_DUMP - add_memory_region(reserve_low_mem, max_memory, BOOT_MEM_RAM); + memblock_add(reserve_low_mem, max_memory); total += max_memory; #else #ifdef CONFIG_KEXEC if (crashk_size > 0) { - add_memory_region(crashk_base, crashk_size, BOOT_MEM_RAM); + memblock_add(crashk_base, crashk_size); crashk_end = crashk_base + crashk_size; } #endif /* - * When allocating memory, we want incrementing addresses from - * bootmem_alloc so the code in add_memory_region can merge - * regions next to each other. + * When allocating memory, we want incrementing addresses, + * which is handled by memblock */ cvmx_bootmem_lock(); while (total < max_memory) { @@ -1039,13 +1039,9 @@ void __init plat_mem_setup(void) */ if (memory < crashk_base && end > crashk_end) { /* region is fully in */ - add_memory_region(memory, - crashk_base - memory, - BOOT_MEM_RAM); + memblock_add(memory, crashk_base - memory); total += crashk_base - memory; - add_memory_region(crashk_end, - end - crashk_end, - BOOT_MEM_RAM); + memblock_add(crashk_end, end - crashk_end); total += end - crashk_end; continue; } @@ -1073,7 +1069,7 @@ void __init plat_mem_setup(void) */ mem_alloc_size -= end - crashk_base; #endif - add_memory_region(memory, mem_alloc_size, BOOT_MEM_RAM); + memblock_add(memory, mem_alloc_size); total += mem_alloc_size; /* Recovering mem_alloc_size */ mem_alloc_size = 4 << 20; @@ -1088,7 +1084,7 @@ void __init plat_mem_setup(void) /* Adjust for physical offset. */ kernel_start &= ~0xffffffff80000000ULL; - add_memory_region(kernel_start, kernel_size, BOOT_MEM_RAM); + memblock_add(kernel_start, kernel_size); #endif /* CONFIG_CRASH_DUMP */ #ifdef CONFIG_CAVIUM_RESERVE32 @@ -1126,7 +1122,7 @@ EXPORT_SYMBOL(prom_putchar); void __init prom_free_prom_memory(void) { - if (CAVIUM_OCTEON_DCACHE_PREFETCH_WAR) { + if (OCTEON_IS_MODEL(OCTEON_CN6XXX)) { /* Check for presence of Core-14449 fix. */ u32 insn; u32 *foo; diff --git a/arch/mips/cobalt/setup.c b/arch/mips/cobalt/setup.c index c136a18c7221..46581e686882 100644 --- a/arch/mips/cobalt/setup.c +++ b/arch/mips/cobalt/setup.c @@ -13,6 +13,7 @@ #include <linux/interrupt.h> #include <linux/io.h> #include <linux/ioport.h> +#include <linux/memblock.h> #include <linux/pm.h> #include <asm/bootinfo.h> @@ -112,7 +113,7 @@ void __init prom_init(void) strlcat(arcs_cmdline, " ", COMMAND_LINE_SIZE); } - add_memory_region(0x0, memsz, BOOT_MEM_RAM); + memblock_add(0, memsz); setup_8250_early_printk_port(CKSEG1ADDR(0x1c800000), 0, 0); } diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index 0a46199fdc3f..052c5ad0f2b1 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -22,7 +22,7 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLAB=y -CONFIG_MACH_INGENIC=y +CONFIG_MACH_INGENIC_SOC=y CONFIG_JZ4780_CI20=y CONFIG_HIGHMEM=y CONFIG_HZ_100=y @@ -42,7 +42,7 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y -# CONFIG_FW_LOADER is not set +CONFIG_FW_LOADER=m # CONFIG_ALLOW_DEV_COREDUMP is not set CONFIG_MTD=y CONFIG_MTD_RAW_NAND=y diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig index e924c817f73d..55d0690a3ffe 100644 --- a/arch/mips/configs/cu1000-neo_defconfig +++ b/arch/mips/configs/cu1000-neo_defconfig @@ -1,5 +1,3 @@ -CONFIG_LOCALVERSION_AUTO=y -CONFIG_KERNEL_GZIP=y CONFIG_SYSVIPC=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y @@ -9,7 +7,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_KMEM=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y @@ -22,7 +19,7 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLAB=y -CONFIG_MACH_INGENIC=y +CONFIG_MACH_INGENIC_SOC=y CONFIG_X1000_CU1000_NEO=y CONFIG_HIGHMEM=y CONFIG_HZ_100=y @@ -31,7 +28,6 @@ CONFIG_HZ_100=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_COMPACTION is not set CONFIG_CMA=y -CONFIG_CMA_AREAS=7 CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -40,19 +36,16 @@ CONFIG_CFG80211=y CONFIG_UEVENT_HELPER=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y -# CONFIG_FW_LOADER is not set # CONFIG_ALLOW_DEV_COREDUMP is not set CONFIG_NETDEVICES=y CONFIG_STMMAC_ETH=y CONFIG_SMSC_PHY=y CONFIG_BRCMFMAC=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_VT_HW_CONSOLE_BINDING=y CONFIG_LEGACY_PTY_COUNT=2 -CONFIG_SERIAL_EARLYCON=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_NR_UARTS=3 @@ -66,8 +59,6 @@ CONFIG_GPIO_SYSFS=y CONFIG_SENSORS_ADS7828=y CONFIG_WATCHDOG=y CONFIG_JZ4740_WDT=y -# CONFIG_LCD_CLASS_DEVICE is not set -# CONFIG_BACKLIGHT_CLASS_DEVICE is not set # CONFIG_VGA_CONSOLE is not set # CONFIG_HID is not set # CONFIG_USB_SUPPORT is not set @@ -82,8 +73,6 @@ CONFIG_RTC_DRV_JZ4740=y CONFIG_DMADEVICES=y CONFIG_DMA_JZ4780=y # CONFIG_IOMMU_SUPPORT is not set -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y CONFIG_EXT4_FS=y # CONFIG_DNOTIFY is not set CONFIG_AUTOFS_FS=y @@ -108,8 +97,8 @@ CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 CONFIG_DEBUG_INFO=y CONFIG_STRIP_ASM_SYMS=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/cu1830-neo_defconfig b/arch/mips/configs/cu1830-neo_defconfig index cbfb62900273..e7064851a47a 100644 --- a/arch/mips/configs/cu1830-neo_defconfig +++ b/arch/mips/configs/cu1830-neo_defconfig @@ -1,5 +1,3 @@ -CONFIG_LOCALVERSION_AUTO=y -CONFIG_KERNEL_GZIP=y CONFIG_SYSVIPC=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y @@ -9,7 +7,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_KMEM=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y @@ -22,7 +19,7 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLAB=y -CONFIG_MACH_INGENIC=y +CONFIG_MACH_INGENIC_SOC=y CONFIG_X1830_CU1830_NEO=y CONFIG_HIGHMEM=y CONFIG_HZ_100=y @@ -31,7 +28,6 @@ CONFIG_HZ_100=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_COMPACTION is not set CONFIG_CMA=y -CONFIG_CMA_AREAS=7 CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -40,7 +36,6 @@ CONFIG_CFG80211=y CONFIG_UEVENT_HELPER=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y -# CONFIG_FW_LOADER is not set # CONFIG_ALLOW_DEV_COREDUMP is not set CONFIG_MD=y CONFIG_BLK_DEV_MD=y @@ -49,13 +44,11 @@ CONFIG_NETDEVICES=y CONFIG_STMMAC_ETH=y CONFIG_ICPLUS_PHY=y CONFIG_BRCMFMAC=y -# CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_VT_HW_CONSOLE_BINDING=y CONFIG_LEGACY_PTY_COUNT=2 -CONFIG_SERIAL_EARLYCON=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_NR_UARTS=2 @@ -69,8 +62,6 @@ CONFIG_GPIO_SYSFS=y CONFIG_SENSORS_ADS7828=y CONFIG_WATCHDOG=y CONFIG_JZ4740_WDT=y -# CONFIG_LCD_CLASS_DEVICE is not set -# CONFIG_BACKLIGHT_CLASS_DEVICE is not set # CONFIG_VGA_CONSOLE is not set # CONFIG_HID is not set # CONFIG_USB_SUPPORT is not set @@ -85,8 +76,6 @@ CONFIG_RTC_DRV_JZ4740=y CONFIG_DMADEVICES=y CONFIG_DMA_JZ4780=y # CONFIG_IOMMU_SUPPORT is not set -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y CONFIG_EXT4_FS=y # CONFIG_DNOTIFY is not set CONFIG_AUTOFS_FS=y @@ -111,8 +100,8 @@ CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 CONFIG_DEBUG_INFO=y CONFIG_STRIP_ASM_SYMS=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/gcw0_defconfig b/arch/mips/configs/gcw0_defconfig index 4994749b9eaa..7e28a4fe9d84 100644 --- a/arch/mips/configs/gcw0_defconfig +++ b/arch/mips/configs/gcw0_defconfig @@ -4,7 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_PREEMPT_VOLUNTARY=y CONFIG_EMBEDDED=y CONFIG_PROFILING=y -CONFIG_MACH_INGENIC=y +CONFIG_MACH_INGENIC_SOC=y CONFIG_JZ4770_GCW0=y CONFIG_HIGHMEM=y # CONFIG_SECCOMP is not set diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig index a65b08de4098..38a817ead8e7 100644 --- a/arch/mips/configs/loongson3_defconfig +++ b/arch/mips/configs/loongson3_defconfig @@ -30,7 +30,6 @@ CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y CONFIG_MACH_LOONGSON64=y CONFIG_CPU_HAS_MSA=y -CONFIG_SMP=y CONFIG_NR_CPUS=16 CONFIG_HZ_256=y CONFIG_KEXEC=y @@ -403,7 +402,6 @@ CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_DEFLATE=m CONFIG_PRINTK_TIME=y -CONFIG_FRAME_WARN=1024 CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig deleted file mode 100644 index d06db6b87959..000000000000 --- a/arch/mips/configs/pnx8335_stb225_defconfig +++ /dev/null @@ -1,77 +0,0 @@ -# CONFIG_LOCALVERSION_AUTO is not set -# CONFIG_SWAP is not set -CONFIG_SYSVIPC=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_PREEMPT_VOLUNTARY=y -CONFIG_LOG_BUF_SHIFT=14 -CONFIG_EXPERT=y -CONFIG_SLAB=y -CONFIG_NXP_STB225=y -CONFIG_CPU_LITTLE_ENDIAN=y -CONFIG_HZ_128=y -# CONFIG_SECCOMP is not set -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_INET_AH=y -# CONFIG_IPV6 is not set -CONFIG_MTD=y -CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_BLOCK=y -CONFIG_MTD_CFI=y -CONFIG_MTD_CFI_ADV_OPTIONS=y -CONFIG_MTD_CFI_LE_BYTE_SWAP=y -CONFIG_MTD_CFI_GEOMETRY=y -CONFIG_MTD_CFI_AMDSTD=y -CONFIG_MTD_PHYSMAP=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_SD=y -# CONFIG_SCSI_LOWLEVEL is not set -CONFIG_ATA=y -CONFIG_NETDEVICES=y -CONFIG_INPUT_EVDEV=m -CONFIG_INPUT_EVBUG=m -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_VT_CONSOLE is not set -# CONFIG_LEGACY_PTYS is not set -CONFIG_SERIAL_PNX8XXX=y -CONFIG_SERIAL_PNX8XXX_CONSOLE=y -CONFIG_HW_RANDOM=y -CONFIG_I2C=y -CONFIG_I2C_CHARDEV=y -# CONFIG_HWMON is not set -CONFIG_FB=y -# CONFIG_VGA_CONSOLE is not set -CONFIG_SOUND=m -CONFIG_SND=m -CONFIG_SND_VERBOSE_PRINTK=y -CONFIG_SND_DEBUG=y -CONFIG_SND_SEQUENCER=m -CONFIG_EXT2_FS=m -# CONFIG_DNOTIFY is not set -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_TMPFS=y -CONFIG_JFFS2_FS=y -CONFIG_CRAMFS=y -CONFIG_NFS_FS=y -CONFIG_ROOT_NFS=y -CONFIG_NFSD=m -CONFIG_NFSD_V3=y -CONFIG_NLS=y -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_UTF8=m diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig index 81bfbee72b0c..b4448d0876d5 100644 --- a/arch/mips/configs/qi_lb60_defconfig +++ b/arch/mips/configs/qi_lb60_defconfig @@ -7,7 +7,8 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLAB=y -CONFIG_MACH_INGENIC=y +CONFIG_MACH_INGENIC_SOC=y +CONFIG_JZ4740_QI_LB60=y CONFIG_HZ_100=y # CONFIG_SECCOMP is not set CONFIG_MODULES=y @@ -72,9 +73,7 @@ CONFIG_DRM=y CONFIG_DRM_FBDEV_OVERALLOC=200 CONFIG_DRM_PANEL_SIMPLE=y CONFIG_DRM_INGENIC=y -# CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_GENERIC is not set # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -170,9 +169,9 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y CONFIG_STRIP_ASM_SYMS=y CONFIG_READABLE_ASM=y +CONFIG_KGDB=y CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_PANIC_ON_OOPS=y # CONFIG_FTRACE is not set -CONFIG_KGDB=y diff --git a/arch/mips/configs/rs90_defconfig b/arch/mips/configs/rs90_defconfig index de6752051ecc..dfbb9fed9a42 100644 --- a/arch/mips/configs/rs90_defconfig +++ b/arch/mips/configs/rs90_defconfig @@ -19,7 +19,7 @@ CONFIG_EMBEDDED=y # CONFIG_PERF_EVENTS is not set CONFIG_SLAB=y CONFIG_PROFILING=y -CONFIG_MACH_INGENIC=y +CONFIG_MACH_INGENIC_SOC=y CONFIG_JZ4740_RS90=y CONFIG_PAGE_SIZE_16KB=y CONFIG_HZ_100=y @@ -80,8 +80,8 @@ CONFIG_KEYBOARD_GPIO=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=2 -# CONFIG_DEVMEM is not set # CONFIG_HW_RANDOM is not set +# CONFIG_DEVMEM is not set # CONFIG_I2C_COMPAT is not set # CONFIG_I2C_HELPER_AUTO is not set CONFIG_POWER_SUPPLY=y diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c index 5073d2ed78bb..44490c30d63b 100644 --- a/arch/mips/dec/prom/memory.c +++ b/arch/mips/dec/prom/memory.c @@ -12,7 +12,6 @@ #include <linux/types.h> #include <asm/addrspace.h> -#include <asm/bootinfo.h> #include <asm/dec/machtype.h> #include <asm/dec/prom.h> #include <asm/page.h> @@ -28,7 +27,7 @@ volatile unsigned long mem_err; /* So we know an error occurred */ #define CHUNK_SIZE 0x400000 -static inline void pmax_setup_memory_region(void) +static __init void pmax_setup_memory_region(void) { volatile unsigned char *memory_page, dummy; char old_handler[0x80]; @@ -50,15 +49,14 @@ static inline void pmax_setup_memory_region(void) } memcpy((void *)(CKSEG0 + 0x80), &old_handler, 0x80); - add_memory_region(0, (unsigned long)memory_page - CKSEG1 - CHUNK_SIZE, - BOOT_MEM_RAM); + memblock_add(0, (unsigned long)memory_page - CKSEG1 - CHUNK_SIZE); } /* * Use the REX prom calls to get hold of the memory bitmap, and thence * determine memory size. */ -static inline void rex_setup_memory_region(void) +static __init void rex_setup_memory_region(void) { int i, bitmap_size; unsigned long mem_start = 0, mem_size = 0; @@ -76,13 +74,13 @@ static inline void rex_setup_memory_region(void) else if (!mem_size) mem_start += (8 * bm->pagesize); else { - add_memory_region(mem_start, mem_size, BOOT_MEM_RAM); + memblock_add(mem_start, mem_size); mem_start += mem_size + (8 * bm->pagesize); mem_size = 0; } } if (mem_size) - add_memory_region(mem_start, mem_size, BOOT_MEM_RAM); + memblock_add(mem_start, mem_size); } void __init prom_meminit(u32 magic) diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c index d4e868b828e5..eaad0ed4b523 100644 --- a/arch/mips/dec/setup.c +++ b/arch/mips/dec/setup.c @@ -6,7 +6,7 @@ * for more details. * * Copyright (C) 1998 Harald Koerfgen - * Copyright (C) 2000, 2001, 2002, 2003, 2005 Maciej W. Rozycki + * Copyright (C) 2000, 2001, 2002, 2003, 2005, 2020 Maciej W. Rozycki */ #include <linux/console.h> #include <linux/export.h> @@ -15,6 +15,7 @@ #include <linux/ioport.h> #include <linux/irq.h> #include <linux/irqnr.h> +#include <linux/memblock.h> #include <linux/param.h> #include <linux/percpu-defs.h> #include <linux/sched.h> @@ -22,6 +23,7 @@ #include <linux/types.h> #include <linux/pm.h> +#include <asm/addrspace.h> #include <asm/bootinfo.h> #include <asm/cpu.h> #include <asm/cpu-features.h> @@ -29,7 +31,9 @@ #include <asm/irq.h> #include <asm/irq_cpu.h> #include <asm/mipsregs.h> +#include <asm/page.h> #include <asm/reboot.h> +#include <asm/sections.h> #include <asm/time.h> #include <asm/traps.h> #include <asm/wbflush.h> @@ -146,6 +150,9 @@ void __init plat_mem_setup(void) ioport_resource.start = ~0UL; ioport_resource.end = 0UL; + + /* Stay away from the firmware working memory area for now. */ + memblock_reserve(PHYS_OFFSET, __pa_symbol(&_text) - PHYS_OFFSET); } /* diff --git a/arch/mips/fw/arc/memory.c b/arch/mips/fw/arc/memory.c index da0712ad85f5..37625ae5e35d 100644 --- a/arch/mips/fw/arc/memory.c +++ b/arch/mips/fw/arc/memory.c @@ -68,20 +68,24 @@ static char *arc_mtypes[8] = { : arc_mtypes[a.arc] #endif +enum { + mem_free, mem_prom_used, mem_reserved +}; + static inline int memtype_classify_arcs(union linux_memtypes type) { switch (type.arcs) { case arcs_fcontig: case arcs_free: - return BOOT_MEM_RAM; + return mem_free; case arcs_atmp: - return BOOT_MEM_ROM_DATA; + return mem_prom_used; case arcs_eblock: case arcs_rvpage: case arcs_bmem: case arcs_prog: case arcs_aperm: - return BOOT_MEM_RESERVED; + return mem_reserved; default: BUG(); } @@ -93,15 +97,15 @@ static inline int memtype_classify_arc(union linux_memtypes type) switch (type.arc) { case arc_free: case arc_fcontig: - return BOOT_MEM_RAM; + return mem_free; case arc_atmp: - return BOOT_MEM_ROM_DATA; + return mem_prom_used; case arc_eblock: case arc_rvpage: case arc_bmem: case arc_prog: case arc_aperm: - return BOOT_MEM_RESERVED; + return mem_reserved; default: BUG(); } @@ -143,9 +147,17 @@ void __weak __init prom_meminit(void) size = p->pages << ARC_PAGE_SHIFT; type = prom_memtype_classify(p->type); - add_memory_region(base, size, type); + /* ignore mirrored RAM on IP28/IP30 */ + if (base < PHYS_OFFSET) + continue; + + memblock_add(base, size); + + if (type == mem_reserved) + memblock_reserve(base, size); - if (type == BOOT_MEM_ROM_DATA) { + if (type == mem_prom_used) { + memblock_reserve(base, size); if (nr_prom_mem >= 5) { pr_err("Too many ROM DATA regions"); continue; diff --git a/arch/mips/fw/sni/sniprom.c b/arch/mips/fw/sni/sniprom.c index 80112f2298b6..8f6730376a42 100644 --- a/arch/mips/fw/sni/sniprom.c +++ b/arch/mips/fw/sni/sniprom.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/init.h> +#include <linux/memblock.h> #include <linux/string.h> #include <linux/console.h> @@ -131,8 +132,7 @@ static void __init sni_mem_init(void) } pr_debug("Bank%d: %08x @ %08x\n", i, memconf[i].size, memconf[i].base); - add_memory_region(memconf[i].base, memconf[i].size, - BOOT_MEM_RAM); + memblock_add(memconf[i].base, memconf[i].size); } } diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index fd6019802657..55d9aed7ced9 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -if MIPS_GENERIC +if MIPS_GENERIC_KERNEL config LEGACY_BOARDS bool @@ -73,6 +73,12 @@ config FIT_IMAGE_FDT_OCELOT from Microsemi in the FIT kernel image. This requires u-boot on the platform. +config BOARD_INGENIC + bool "Support boards based on Ingenic SoCs" + select MACH_INGENIC_GENERIC + help + Enable support for boards based on Ingenic SoCs. + config VIRT_BOARD_RANCHU bool "Support Ranchu platform for Android emulator" help diff --git a/arch/mips/generic/Makefile b/arch/mips/generic/Makefile index 2384a6b09e4c..e37a59bae0a6 100644 --- a/arch/mips/generic/Makefile +++ b/arch/mips/generic/Makefile @@ -11,4 +11,5 @@ obj-y += proc.o obj-$(CONFIG_YAMON_DT_SHIM) += yamon-dt.o obj-$(CONFIG_LEGACY_BOARD_SEAD3) += board-sead3.o obj-$(CONFIG_LEGACY_BOARD_OCELOT) += board-ocelot.o +obj-$(CONFIG_MACH_INGENIC) += board-ingenic.o obj-$(CONFIG_VIRT_BOARD_RANCHU) += board-ranchu.o diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform index 53c33cb72974..f8ef2f9d107e 100644 --- a/arch/mips/generic/Platform +++ b/arch/mips/generic/Platform @@ -8,8 +8,12 @@ # option) any later version. # +# Note: order matters, keep the asm/mach-generic include last. +cflags-$(CONFIG_MACH_INGENIC_SOC) += -I$(srctree)/arch/mips/include/asm/mach-ingenic cflags-$(CONFIG_MIPS_GENERIC) += -I$(srctree)/arch/mips/include/asm/mach-generic + load-$(CONFIG_MIPS_GENERIC) += 0xffffffff80100000 +zload-$(CONFIG_MIPS_GENERIC) += 0xffffffff81000000 all-$(CONFIG_MIPS_GENERIC) := vmlinux.gz.itb its-y := vmlinux.its.S diff --git a/arch/mips/generic/board-ingenic.c b/arch/mips/generic/board-ingenic.c new file mode 100644 index 000000000000..0cec0bea13d6 --- /dev/null +++ b/arch/mips/generic/board-ingenic.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Support for Ingenic SoCs + * + * Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de> + * Copyright (C) 2011, Maarten ter Huurne <maarten@treewalker.org> + * Copyright (C) 2020 Paul Cercueil <paul@crapouillou.net> + */ + +#include <linux/of_address.h> +#include <linux/of_fdt.h> +#include <linux/pm.h> +#include <linux/sizes.h> +#include <linux/suspend.h> +#include <linux/types.h> + +#include <asm/bootinfo.h> +#include <asm/machine.h> +#include <asm/reboot.h> + +static __init char *ingenic_get_system_type(unsigned long machtype) +{ + switch (machtype) { + case MACH_INGENIC_X2000E: + return "X2000E"; + case MACH_INGENIC_X2000: + return "X2000"; + case MACH_INGENIC_X1830: + return "X1830"; + case MACH_INGENIC_X1000E: + return "X1000E"; + case MACH_INGENIC_X1000: + return "X1000"; + case MACH_INGENIC_JZ4780: + return "JZ4780"; + case MACH_INGENIC_JZ4775: + return "JZ4775"; + case MACH_INGENIC_JZ4770: + return "JZ4770"; + case MACH_INGENIC_JZ4725B: + return "JZ4725B"; + default: + return "JZ4740"; + } +} + +static __init const void *ingenic_fixup_fdt(const void *fdt, const void *match_data) +{ + /* + * Old devicetree files for the qi,lb60 board did not have a /memory + * node. Hardcode the memory info here. + */ + if (!fdt_node_check_compatible(fdt, 0, "qi,lb60") && + fdt_path_offset(fdt, "/memory") < 0) + early_init_dt_add_memory_arch(0, SZ_32M); + + mips_machtype = (unsigned long)match_data; + system_type = ingenic_get_system_type(mips_machtype); + + return fdt; +} + +static const struct of_device_id ingenic_of_match[] __initconst = { + { .compatible = "ingenic,jz4740", .data = (void *)MACH_INGENIC_JZ4740 }, + { .compatible = "ingenic,jz4725b", .data = (void *)MACH_INGENIC_JZ4725B }, + { .compatible = "ingenic,jz4770", .data = (void *)MACH_INGENIC_JZ4770 }, + { .compatible = "ingenic,jz4775", .data = (void *)MACH_INGENIC_JZ4775 }, + { .compatible = "ingenic,jz4780", .data = (void *)MACH_INGENIC_JZ4780 }, + { .compatible = "ingenic,x1000", .data = (void *)MACH_INGENIC_X1000 }, + { .compatible = "ingenic,x1000e", .data = (void *)MACH_INGENIC_X1000E }, + { .compatible = "ingenic,x1830", .data = (void *)MACH_INGENIC_X1830 }, + { .compatible = "ingenic,x2000", .data = (void *)MACH_INGENIC_X2000 }, + { .compatible = "ingenic,x2000e", .data = (void *)MACH_INGENIC_X2000E }, + {} +}; + +MIPS_MACHINE(ingenic) = { + .matches = ingenic_of_match, + .fixup_fdt = ingenic_fixup_fdt, +}; + +static void ingenic_wait_instr(void) +{ + __asm__(".set push;\n" + ".set mips3;\n" + "wait;\n" + ".set pop;\n" + ); +} + +static void ingenic_halt(void) +{ + for (;;) + ingenic_wait_instr(); +} + +static int __maybe_unused ingenic_pm_enter(suspend_state_t state) +{ + ingenic_wait_instr(); + + return 0; +} + +static const struct platform_suspend_ops ingenic_pm_ops __maybe_unused = { + .valid = suspend_valid_only_mem, + .enter = ingenic_pm_enter, +}; + +static int __init ingenic_pm_init(void) +{ + if (boot_cpu_type() == CPU_XBURST) { + if (IS_ENABLED(CONFIG_PM_SLEEP)) + suspend_set_ops(&ingenic_pm_ops); + _machine_halt = ingenic_halt; + } + + return 0; + +} +late_initcall(ingenic_pm_init); diff --git a/arch/mips/generic/init.c b/arch/mips/generic/init.c index 805d0135a9f4..66a19337d2ab 100644 --- a/arch/mips/generic/init.c +++ b/arch/mips/generic/init.c @@ -39,12 +39,11 @@ void __init *plat_get_fdt(void) /* Already set up */ return (void *)fdt; - if ((fw_arg0 == -2) && !fdt_check_header((void *)fw_passed_dtb)) { + if (fw_passed_dtb && !fdt_check_header((void *)fw_passed_dtb)) { /* - * We booted using the UHI boot protocol, so we have been - * provided with the appropriate device tree for the board. - * Make use of it & search for any machine struct based upon - * the root compatible string. + * We have been provided with the appropriate device tree for + * the board. Make use of it & search for any machine struct + * based upon the root compatible string. */ fdt = (void *)fw_passed_dtb; @@ -106,7 +105,7 @@ void __init plat_mem_setup(void) if (mach && mach->fixup_fdt) fdt = mach->fixup_fdt(fdt, mach_match_data); - strlcpy(arcs_cmdline, boot_command_line, COMMAND_LINE_SIZE); + fw_init_cmdline(); __dt_setup_arch((void *)fdt); } diff --git a/arch/mips/generic/proc.c b/arch/mips/generic/proc.c index 4c992809cc3f..cce2fde219a3 100644 --- a/arch/mips/generic/proc.c +++ b/arch/mips/generic/proc.c @@ -8,11 +8,16 @@ #include <asm/bootinfo.h> +char *system_type; + const char *get_system_type(void) { const char *str; int err; + if (system_type) + return system_type; + err = of_property_read_string(of_root, "model", &str); if (!err) return str; diff --git a/arch/mips/include/asm/bootinfo.h b/arch/mips/include/asm/bootinfo.h index 147c9327ce04..aa03b1237155 100644 --- a/arch/mips/include/asm/bootinfo.h +++ b/arch/mips/include/asm/bootinfo.h @@ -79,8 +79,10 @@ enum ingenic_machine_type { MACH_INGENIC_JZ4775, MACH_INGENIC_JZ4780, MACH_INGENIC_X1000, + MACH_INGENIC_X1000E, MACH_INGENIC_X1830, MACH_INGENIC_X2000, + MACH_INGENIC_X2000E, }; extern char *system_type; @@ -88,13 +90,6 @@ const char *get_system_type(void); extern unsigned long mips_machtype; -#define BOOT_MEM_RAM 1 -#define BOOT_MEM_ROM_DATA 2 -#define BOOT_MEM_RESERVED 3 -#define BOOT_MEM_INIT_RAM 4 -#define BOOT_MEM_NOMAP 5 - -extern void add_memory_region(phys_addr_t start, phys_addr_t size, long type); extern void detect_memory_region(phys_addr_t start, phys_addr_t sz_min, phys_addr_t sz_max); extern void prom_init(void); diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h index 78cf7e300f12..f2e216eef7da 100644 --- a/arch/mips/include/asm/cpu-features.h +++ b/arch/mips/include/asm/cpu-features.h @@ -171,9 +171,6 @@ #ifndef cpu_has_llsc #define cpu_has_llsc __isa_ge_or_opt(1, MIPS_CPU_LLSC) #endif -#ifndef cpu_has_bp_ghist -#define cpu_has_bp_ghist __opt(MIPS_CPU_BP_GHIST) -#endif #ifndef kernel_uses_llsc #define kernel_uses_llsc cpu_has_llsc #endif diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index 388a82f28a87..c9222cc2244f 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -398,7 +398,6 @@ enum cpu_type_enum { #define MIPS_CPU_RW_LLB BIT_ULL(32) /* LLADDR/LLB writes are allowed */ #define MIPS_CPU_LPA BIT_ULL(33) /* CPU supports Large Physical Addressing */ #define MIPS_CPU_CDMM BIT_ULL(34) /* CPU has Common Device Memory Map */ -#define MIPS_CPU_BP_GHIST BIT_ULL(35) /* R12K+ Branch Prediction Global History */ #define MIPS_CPU_SP BIT_ULL(36) /* Small (1KB) page support */ #define MIPS_CPU_FTLB BIT_ULL(37) /* CPU has Fixed-page-size TLB */ #define MIPS_CPU_NAN_LEGACY BIT_ULL(38) /* Legacy NaN implemented */ diff --git a/arch/mips/include/asm/dma-direct.h b/arch/mips/include/asm/dma-direct.h index 14e352651ce9..9a640118316c 100644 --- a/arch/mips/include/asm/dma-direct.h +++ b/arch/mips/include/asm/dma-direct.h @@ -2,7 +2,7 @@ #ifndef _MIPS_DMA_DIRECT_H #define _MIPS_DMA_DIRECT_H 1 -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr); -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr); +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); #endif /* _MIPS_DMA_DIRECT_H */ diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 2bf8f6014579..d85248404c52 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -21,7 +21,7 @@ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ { \ - if (cpu_has_llsc && R10000_LLSC_WAR) { \ + if (cpu_has_llsc && IS_ENABLED(CONFIG_WAR_R10000_LLSC)) { \ __asm__ __volatile__( \ " .set push \n" \ " .set noat \n" \ @@ -133,7 +133,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; - if (cpu_has_llsc && R10000_LLSC_WAR) { + if (cpu_has_llsc && IS_ENABLED(CONFIG_WAR_R10000_LLSC)) { __asm__ __volatile__( "# futex_atomic_cmpxchg_inatomic \n" " .set push \n" diff --git a/arch/mips/include/asm/idle.h b/arch/mips/include/asm/idle.h index 655a6dbc861a..0992cad9c632 100644 --- a/arch/mips/include/asm/idle.h +++ b/arch/mips/include/asm/idle.h @@ -15,6 +15,8 @@ static inline int using_rollback_handler(void) return cpu_wait == r4k_wait; } +extern void __init check_wait(void); + extern int mips_cpuidle_wait_enter(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); diff --git a/arch/mips/include/asm/jazzdma.h b/arch/mips/include/asm/jazzdma.h index d13f940022d5..c831da7fa898 100644 --- a/arch/mips/include/asm/jazzdma.h +++ b/arch/mips/include/asm/jazzdma.h @@ -10,8 +10,6 @@ */ extern unsigned long vdma_alloc(unsigned long paddr, unsigned long size); extern int vdma_free(unsigned long laddr); -extern int vdma_remap(unsigned long laddr, unsigned long paddr, - unsigned long size); extern unsigned long vdma_phys2log(unsigned long paddr); extern unsigned long vdma_log2phys(unsigned long laddr); extern void vdma_stats(void); /* for debugging only */ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 825d337a505a..24f3d0f9996b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -341,7 +341,7 @@ struct kvm_mips_tlb { #define KVM_MIPS_GUEST_TLB_SIZE 64 struct kvm_vcpu_arch { void *guest_ebase; - int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + int (*vcpu_run)(struct kvm_vcpu *vcpu); /* Host registers preserved across guest mode execution */ unsigned long host_stack; @@ -852,7 +852,7 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); -extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvm_mips_handle_exit(struct kvm_vcpu *vcpu); /* Building of entry/exception code */ int kvm_mips_entry_setup(void); diff --git a/arch/mips/include/asm/llsc.h b/arch/mips/include/asm/llsc.h index c49738bc3bda..ec09fe5d6d6c 100644 --- a/arch/mips/include/asm/llsc.h +++ b/arch/mips/include/asm/llsc.h @@ -28,7 +28,7 @@ * works around a bug present in R10000 CPUs prior to revision 3.0 that could * cause ll-sc sequences to execute non-atomically. */ -#if R10000_LLSC_WAR +#ifdef CONFIG_WAR_R10000_LLSC # define __SC_BEQZ "beqzl " #elif MIPS_ISA_REV >= 6 # define __SC_BEQZ "beqzc " diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h index fef0fda8f82f..ecda7295ddcd 100644 --- a/arch/mips/include/asm/local.h +++ b/arch/mips/include/asm/local.h @@ -31,7 +31,7 @@ static __inline__ long local_add_return(long i, local_t * l) { unsigned long result; - if (kernel_uses_llsc && R10000_LLSC_WAR) { + if (kernel_uses_llsc && IS_ENABLED(CONFIG_WAR_R10000_LLSC)) { unsigned long temp; __asm__ __volatile__( @@ -80,7 +80,7 @@ static __inline__ long local_sub_return(long i, local_t * l) { unsigned long result; - if (kernel_uses_llsc && R10000_LLSC_WAR) { + if (kernel_uses_llsc && IS_ENABLED(CONFIG_WAR_R10000_LLSC)) { unsigned long temp; __asm__ __volatile__( diff --git a/arch/mips/include/asm/m48t37.h b/arch/mips/include/asm/m48t37.h deleted file mode 100644 index 3687a02e692b..000000000000 --- a/arch/mips/include/asm/m48t37.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Registers for the SGS-Thomson M48T37 Timekeeper RAM chip - */ -#ifndef _ASM_M48T37_H -#define _ASM_M48T37_H - -#include <linux/spinlock.h> - -extern spinlock_t rtc_lock; - -struct m48t37_rtc { - volatile u8 pad[0x7ff0]; /* NVRAM */ - volatile u8 flags; - volatile u8 century; - volatile u8 alarm_sec; - volatile u8 alarm_min; - volatile u8 alarm_hour; - volatile u8 alarm_data; - volatile u8 interrupts; - volatile u8 watchdog; - volatile u8 control; - volatile u8 sec; - volatile u8 min; - volatile u8 hour; - volatile u8 day; - volatile u8 date; - volatile u8 month; - volatile u8 year; -}; - -#define M48T37_RTC_SET 0x80 -#define M48T37_RTC_STOPPED 0x80 -#define M48T37_RTC_READ 0x40 - -#endif /* _ASM_M48T37_H */ diff --git a/arch/mips/include/asm/mach-au1x00/cpu-feature-overrides.h b/arch/mips/include/asm/mach-au1x00/cpu-feature-overrides.h index ecfbb5aeada3..e6e527224a15 100644 --- a/arch/mips/include/asm/mach-au1x00/cpu-feature-overrides.h +++ b/arch/mips/include/asm/mach-au1x00/cpu-feature-overrides.h @@ -39,7 +39,6 @@ #define cpu_has_guestctl2 0 #define cpu_has_guestid 0 #define cpu_has_drg 0 -#define cpu_has_bp_ghist 0 #define cpu_has_mips16 0 #define cpu_has_mips16e2 0 #define cpu_has_mdmx 0 diff --git a/arch/mips/include/asm/mach-au1x00/gpio-au1300.h b/arch/mips/include/asm/mach-au1x00/gpio-au1300.h index d25846a1291f..d16add7ba49d 100644 --- a/arch/mips/include/asm/mach-au1x00/gpio-au1300.h +++ b/arch/mips/include/asm/mach-au1x00/gpio-au1300.h @@ -120,141 +120,4 @@ static inline int au1300_gpio_getinitlvl(unsigned int gpio) return (v >> gpio) & 1; } -/**********************************************************************/ - -/* Linux gpio framework integration. -* -* 4 use cases of Alchemy GPIOS: -*(1) GPIOLIB=y, ALCHEMY_GPIO_INDIRECT=y: -* Board must register gpiochips. -*(2) GPIOLIB=y, ALCHEMY_GPIO_INDIRECT=n: -* A gpiochip for the 75 GPIOs is registered. -* -*(3) GPIOLIB=n, ALCHEMY_GPIO_INDIRECT=y: -* the boards' gpio.h must provide the linux gpio wrapper functions, -* -*(4) GPIOLIB=n, ALCHEMY_GPIO_INDIRECT=n: -* inlinable gpio functions are provided which enable access to the -* Au1300 gpios only by using the numbers straight out of the data- -* sheets. - -* Cases 1 and 3 are intended for boards which want to provide their own -* GPIO namespace and -operations (i.e. for example you have 8 GPIOs -* which are in part provided by spare Au1300 GPIO pins and in part by -* an external FPGA but you still want them to be accessible in linux -* as gpio0-7. The board can of course use the alchemy_gpioX_* functions -* as required). -*/ - -#ifndef CONFIG_GPIOLIB - -#ifdef CONFIG_ALCHEMY_GPIOINT_AU1300 - -#ifndef CONFIG_ALCHEMY_GPIO_INDIRECT /* case (4) */ - -static inline int gpio_direction_input(unsigned int gpio) -{ - return au1300_gpio_direction_input(gpio); -} - -static inline int gpio_direction_output(unsigned int gpio, int v) -{ - return au1300_gpio_direction_output(gpio, v); -} - -static inline int gpio_get_value(unsigned int gpio) -{ - return au1300_gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int v) -{ - au1300_gpio_set_value(gpio, v); -} - -static inline int gpio_get_value_cansleep(unsigned gpio) -{ - return gpio_get_value(gpio); -} - -static inline void gpio_set_value_cansleep(unsigned gpio, int value) -{ - gpio_set_value(gpio, value); -} - -static inline int gpio_is_valid(unsigned int gpio) -{ - return au1300_gpio_is_valid(gpio); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return au1300_gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return au1300_gpio_to_irq(gpio); -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return au1300_irq_to_gpio(irq); -} - -static inline int gpio_request(unsigned int gpio, const char *label) -{ - return 0; -} - -static inline int gpio_request_one(unsigned gpio, - unsigned long flags, const char *label) -{ - return 0; -} - -static inline int gpio_request_array(struct gpio *array, size_t num) -{ - return 0; -} - -static inline void gpio_free(unsigned gpio) -{ -} - -static inline void gpio_free_array(struct gpio *array, size_t num) -{ -} - -static inline int gpio_set_debounce(unsigned gpio, unsigned debounce) -{ - return -ENOSYS; -} - -static inline void gpio_unexport(unsigned gpio) -{ -} - -static inline int gpio_export(unsigned gpio, bool direction_may_change) -{ - return -ENOSYS; -} - -static inline int gpio_sysfs_set_active_low(unsigned gpio, int value) -{ - return -ENOSYS; -} - -static inline int gpio_export_link(struct device *dev, const char *name, - unsigned gpio) -{ - return -ENOSYS; -} - -#endif /* !CONFIG_ALCHEMY_GPIO_INDIRECT */ - -#endif /* CONFIG_ALCHEMY_GPIOINT_AU1300 */ - -#endif /* CONFIG GPIOLIB */ - #endif /* _GPIO_AU1300_H_ */ diff --git a/arch/mips/include/asm/mach-bcm47xx/bcm47xx.h b/arch/mips/include/asm/mach-bcm47xx/bcm47xx.h index d7f1ef246d5c..93817bfb7fb2 100644 --- a/arch/mips/include/asm/mach-bcm47xx/bcm47xx.h +++ b/arch/mips/include/asm/mach-bcm47xx/bcm47xx.h @@ -10,6 +10,7 @@ #include <linux/bcma/bcma.h> #include <linux/bcma/bcma_soc.h> #include <linux/bcm47xx_nvram.h> +#include <linux/bcm47xx_sprom.h> enum bcm47xx_bus_type { #ifdef CONFIG_BCM47XX_SSB @@ -32,9 +33,6 @@ union bcm47xx_bus { extern union bcm47xx_bus bcm47xx_bus; extern enum bcm47xx_bus_type bcm47xx_bus_type; -void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix, - bool fallback); - void bcm47xx_set_system_type(u16 chip_id); #endif /* __ASM_BCM47XX_H */ diff --git a/arch/mips/include/asm/mach-cavium-octeon/war.h b/arch/mips/include/asm/mach-cavium-octeon/war.h deleted file mode 100644 index 2421411b7636..000000000000 --- a/arch/mips/include/asm/mach-cavium-octeon/war.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - * Copyright (C) 2008 Cavium Networks <support@caviumnetworks.com> - */ -#ifndef __ASM_MIPS_MACH_CAVIUM_OCTEON_WAR_H -#define __ASM_MIPS_MACH_CAVIUM_OCTEON_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#define CAVIUM_OCTEON_DCACHE_PREFETCH_WAR \ - OCTEON_IS_MODEL(OCTEON_CN6XXX) - -#endif /* __ASM_MIPS_MACH_CAVIUM_OCTEON_WAR_H */ diff --git a/arch/mips/include/asm/mach-generic/irq.h b/arch/mips/include/asm/mach-generic/irq.h index 72ac2c202c55..079889ced4f3 100644 --- a/arch/mips/include/asm/mach-generic/irq.h +++ b/arch/mips/include/asm/mach-generic/irq.h @@ -9,7 +9,7 @@ #define __ASM_MACH_GENERIC_IRQ_H #ifndef NR_IRQS -#define NR_IRQS 128 +#define NR_IRQS 256 #endif #ifdef CONFIG_I8259 diff --git a/arch/mips/include/asm/mach-generic/war.h b/arch/mips/include/asm/mach-generic/war.h deleted file mode 100644 index f0f4a35d0870..000000000000 --- a/arch/mips/include/asm/mach-generic/war.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MACH_GENERIC_WAR_H -#define __ASM_MACH_GENERIC_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MACH_GENERIC_WAR_H */ diff --git a/arch/mips/include/asm/mach-jz4740/cpu-feature-overrides.h b/arch/mips/include/asm/mach-ingenic/cpu-feature-overrides.h index 7c5e576f9d96..7c5e576f9d96 100644 --- a/arch/mips/include/asm/mach-jz4740/cpu-feature-overrides.h +++ b/arch/mips/include/asm/mach-ingenic/cpu-feature-overrides.h diff --git a/arch/mips/include/asm/mach-ip22/war.h b/arch/mips/include/asm/mach-ip22/war.h deleted file mode 100644 index b48eb4ac362d..000000000000 --- a/arch/mips/include/asm/mach-ip22/war.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_IP22_WAR_H -#define __ASM_MIPS_MACH_IP22_WAR_H - -/* - * R4600 CPU modules for the Indy come with both V1.7 and V2.0 processors. - */ - -#define R4600_V1_INDEX_ICACHEOP_WAR 1 -#define R4600_V1_HIT_CACHEOP_WAR 1 -#define R4600_V2_HIT_CACHEOP_WAR 1 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_IP22_WAR_H */ diff --git a/arch/mips/include/asm/mach-ip27/kmalloc.h b/arch/mips/include/asm/mach-ip27/kmalloc.h deleted file mode 100644 index 82c23ce2afa7..000000000000 --- a/arch/mips/include/asm/mach-ip27/kmalloc.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ASM_MACH_IP27_KMALLOC_H -#define __ASM_MACH_IP27_KMALLOC_H - -/* - * All happy, no need to define ARCH_DMA_MINALIGN - */ - -#endif /* __ASM_MACH_IP27_KMALLOC_H */ diff --git a/arch/mips/include/asm/mach-ip27/war.h b/arch/mips/include/asm/mach-ip27/war.h deleted file mode 100644 index ef3efce0094a..000000000000 --- a/arch/mips/include/asm/mach-ip27/war.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_IP27_WAR_H -#define __ASM_MIPS_MACH_IP27_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 1 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_IP27_WAR_H */ diff --git a/arch/mips/include/asm/mach-ip28/cpu-feature-overrides.h b/arch/mips/include/asm/mach-ip28/cpu-feature-overrides.h index ba8b4e30b3e2..613bbc10c1f2 100644 --- a/arch/mips/include/asm/mach-ip28/cpu-feature-overrides.h +++ b/arch/mips/include/asm/mach-ip28/cpu-feature-overrides.h @@ -25,7 +25,7 @@ #define cpu_has_mcheck 0 #define cpu_has_ejtag 0 -#define cpu_has_llsc 1 +#define cpu_has_llsc 0 #define cpu_has_vtag_icache 0 #define cpu_has_dc_aliases 0 /* see probe_pcache() */ #define cpu_has_ic_fills_f_dc 0 diff --git a/arch/mips/include/asm/mach-ip28/war.h b/arch/mips/include/asm/mach-ip28/war.h deleted file mode 100644 index 61cd67354829..000000000000 --- a/arch/mips/include/asm/mach-ip28/war.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_IP28_WAR_H -#define __ASM_MIPS_MACH_IP28_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 1 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_IP28_WAR_H */ diff --git a/arch/mips/include/asm/mach-ip30/irq.h b/arch/mips/include/asm/mach-ip30/irq.h deleted file mode 100644 index 27ba899c95be..000000000000 --- a/arch/mips/include/asm/mach-ip30/irq.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * HEART IRQ defines - * - * Copyright (C) 2009 Johannes Dickgreber <tanzy@gmx.de> - * 2014-2016 Joshua Kinard <kumba@gentoo.org> - * - */ - -#ifndef __ASM_MACH_IP30_IRQ_H -#define __ASM_MACH_IP30_IRQ_H - -/* - * HEART has 64 hardware interrupts, but use 128 to leave room for a few - * software interrupts as well (such as the CPU timer interrupt. - */ -#define NR_IRQS 128 - -extern void __init ip30_install_ipi(void); - -/* - * HEART has 64 interrupt vectors available to it, subdivided into five - * priority levels. They are numbered 0 to 63. - */ -#define HEART_NUM_IRQS 64 - -/* - * These are the five interrupt priority levels and their corresponding - * CPU IPx interrupt pins. - * - * Level 4 - Error Interrupts. - * Level 3 - HEART timer interrupt. - * Level 2 - CPU IPI, CPU debug, power putton, general device interrupts. - * Level 1 - General device interrupts. - * Level 0 - General device GFX flow control interrupts. - */ -#define HEART_L4_INT_MASK 0xfff8000000000000ULL /* IP6 */ -#define HEART_L3_INT_MASK 0x0004000000000000ULL /* IP5 */ -#define HEART_L2_INT_MASK 0x0003ffff00000000ULL /* IP4 */ -#define HEART_L1_INT_MASK 0x00000000ffff0000ULL /* IP3 */ -#define HEART_L0_INT_MASK 0x000000000000ffffULL /* IP2 */ - -/* HEART L0 Interrupts (Low Priority) */ -#define HEART_L0_INT_GENERIC 0 -#define HEART_L0_INT_FLOW_CTRL_HWTR_0 1 -#define HEART_L0_INT_FLOW_CTRL_HWTR_1 2 - -/* HEART L2 Interrupts (High Priority) */ -#define HEART_L2_INT_RESCHED_CPU_0 46 -#define HEART_L2_INT_RESCHED_CPU_1 47 -#define HEART_L2_INT_CALL_CPU_0 48 -#define HEART_L2_INT_CALL_CPU_1 49 - -/* HEART L3 Interrupts (Compare/Counter Timer) */ -#define HEART_L3_INT_TIMER 50 - -/* HEART L4 Interrupts (Errors) */ -#define HEART_L4_INT_XWID_ERR_9 51 -#define HEART_L4_INT_XWID_ERR_A 52 -#define HEART_L4_INT_XWID_ERR_B 53 -#define HEART_L4_INT_XWID_ERR_C 54 -#define HEART_L4_INT_XWID_ERR_D 55 -#define HEART_L4_INT_XWID_ERR_E 56 -#define HEART_L4_INT_XWID_ERR_F 57 -#define HEART_L4_INT_XWID_ERR_XBOW 58 -#define HEART_L4_INT_CPU_BUS_ERR_0 59 -#define HEART_L4_INT_CPU_BUS_ERR_1 60 -#define HEART_L4_INT_CPU_BUS_ERR_2 61 -#define HEART_L4_INT_CPU_BUS_ERR_3 62 -#define HEART_L4_INT_HEART_EXCP 63 - -/* - * Power Switch is wired via BaseIO BRIDGE slot #6. - * - * ACFail is wired via BaseIO BRIDGE slot #7. - */ -#define IP30_POWER_IRQ HEART_L2_INT_POWER_BTN - -#include <asm/mach-generic/irq.h> - -#define IP30_HEART_L0_IRQ (MIPS_CPU_IRQ_BASE + 2) -#define IP30_HEART_L1_IRQ (MIPS_CPU_IRQ_BASE + 3) -#define IP30_HEART_L2_IRQ (MIPS_CPU_IRQ_BASE + 4) -#define IP30_HEART_TIMER_IRQ (MIPS_CPU_IRQ_BASE + 5) -#define IP30_HEART_ERR_IRQ (MIPS_CPU_IRQ_BASE + 6) - -#endif /* __ASM_MACH_IP30_IRQ_H */ diff --git a/arch/mips/include/asm/mach-ip30/war.h b/arch/mips/include/asm/mach-ip30/war.h deleted file mode 100644 index a1fa0c1f5300..000000000000 --- a/arch/mips/include/asm/mach-ip30/war.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_IP30_WAR_H -#define __ASM_MIPS_MACH_IP30_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#ifdef CONFIG_CPU_R10000 -#define R10000_LLSC_WAR 1 -#else -#define R10000_LLSC_WAR 0 -#endif -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_IP30_WAR_H */ diff --git a/arch/mips/include/asm/mach-ip32/war.h b/arch/mips/include/asm/mach-ip32/war.h deleted file mode 100644 index e77b9d1b6c96..000000000000 --- a/arch/mips/include/asm/mach-ip32/war.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_IP32_WAR_H -#define __ASM_MIPS_MACH_IP32_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 1 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_IP32_WAR_H */ diff --git a/arch/mips/include/asm/mach-jz4740/irq.h b/arch/mips/include/asm/mach-jz4740/irq.h deleted file mode 100644 index 27c543bd340f..000000000000 --- a/arch/mips/include/asm/mach-jz4740/irq.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de> - * JZ4740 IRQ definitions - */ - -#ifndef __ASM_MACH_JZ4740_IRQ_H__ -#define __ASM_MACH_JZ4740_IRQ_H__ - -#define MIPS_CPU_IRQ_BASE 0 -#define NR_IRQS 256 - -#endif diff --git a/arch/mips/include/asm/mach-loongson2ef/mc146818rtc.h b/arch/mips/include/asm/mach-loongson2ef/mc146818rtc.h deleted file mode 100644 index 00d602629a55..000000000000 --- a/arch/mips/include/asm/mach-loongson2ef/mc146818rtc.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1998, 2001, 03, 07 by Ralf Baechle (ralf@linux-mips.org) - * - * RTC routines for PC style attached Dallas chip. - */ -#ifndef __ASM_MACH_LOONGSON2EF_MC146818RTC_H -#define __ASM_MACH_LOONGSON2EF_MC146818RTC_H - -#include <linux/io.h> - -#define RTC_PORT(x) (0x70 + (x)) -#define RTC_IRQ 8 - -static inline unsigned char CMOS_READ(unsigned long addr) -{ - outb_p(addr, RTC_PORT(0)); - return inb_p(RTC_PORT(1)); -} - -static inline void CMOS_WRITE(unsigned char data, unsigned long addr) -{ - outb_p(addr, RTC_PORT(0)); - outb_p(data, RTC_PORT(1)); -} - -#define RTC_ALWAYS_BCD 0 - -#ifndef mc146818_decode_year -#define mc146818_decode_year(year) ((year) < 70 ? (year) + 2000 : (year) + 1970) -#endif - -#endif /* __ASM_MACH_LOONGSON2EF_MC146818RTC_H */ diff --git a/arch/mips/include/asm/mach-loongson64/irq.h b/arch/mips/include/asm/mach-loongson64/irq.h index bf2480923154..98ea977cf0b8 100644 --- a/arch/mips/include/asm/mach-loongson64/irq.h +++ b/arch/mips/include/asm/mach-loongson64/irq.h @@ -5,7 +5,8 @@ /* cpu core interrupt numbers */ #define NR_IRQS_LEGACY 16 #define NR_MIPS_CPU_IRQS 8 -#define NR_IRQS (NR_IRQS_LEGACY + NR_MIPS_CPU_IRQS + 256) +#define NR_MAX_CHAINED_IRQS 40 /* Chained IRQs means those not directly used by devices */ +#define NR_IRQS (NR_IRQS_LEGACY + NR_MIPS_CPU_IRQS + NR_MAX_CHAINED_IRQS + 256) #define MIPS_CPU_IRQ_BASE NR_IRQS_LEGACY diff --git a/arch/mips/include/asm/mach-loongson64/mmzone.h b/arch/mips/include/asm/mach-loongson64/mmzone.h index 5eaca4fe3f92..ebb1deaa77b9 100644 --- a/arch/mips/include/asm/mach-loongson64/mmzone.h +++ b/arch/mips/include/asm/mach-loongson64/mmzone.h @@ -10,13 +10,9 @@ #define _ASM_MACH_LOONGSON64_MMZONE_H #define NODE_ADDRSPACE_SHIFT 44 -#define NODE0_ADDRSPACE_OFFSET 0x000000000000UL -#define NODE1_ADDRSPACE_OFFSET 0x100000000000UL -#define NODE2_ADDRSPACE_OFFSET 0x200000000000UL -#define NODE3_ADDRSPACE_OFFSET 0x300000000000UL #define pa_to_nid(addr) (((addr) & 0xf00000000000) >> NODE_ADDRSPACE_SHIFT) -#define nid_to_addrbase(nid) ((nid) << NODE_ADDRSPACE_SHIFT) +#define nid_to_addrbase(nid) ((unsigned long)(nid) << NODE_ADDRSPACE_SHIFT) extern struct pglist_data *__node_data[]; diff --git a/arch/mips/include/asm/mach-malta/malta-dtshim.h b/arch/mips/include/asm/mach-malta/malta-dtshim.h deleted file mode 100644 index 7c97b710121d..000000000000 --- a/arch/mips/include/asm/mach-malta/malta-dtshim.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2015 Imagination Technologies - * Author: Paul Burton <paul.burton@mips.com> - */ - -#ifndef __MIPS_MALTA_DTSHIM_H__ -#define __MIPS_MALTA_DTSHIM_H__ - -#include <linux/init.h> - -#ifdef CONFIG_MIPS_MALTA - -extern void __init *malta_dt_shim(void *fdt); - -#else /* !CONFIG_MIPS_MALTA */ - -static inline void *malta_dt_shim(void *fdt) -{ - return fdt; -} - -#endif /* !CONFIG_MIPS_MALTA */ - -#endif /* __MIPS_MALTA_DTSHIM_H__ */ diff --git a/arch/mips/include/asm/mach-malta/malta-pm.h b/arch/mips/include/asm/mach-malta/malta-pm.h deleted file mode 100644 index 2a5146d79313..000000000000 --- a/arch/mips/include/asm/mach-malta/malta-pm.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2014 Imagination Technologies - * Author: Paul Burton <paul.burton@mips.com> - */ - -#ifndef __ASM_MIPS_MACH_MALTA_PM_H__ -#define __ASM_MIPS_MACH_MALTA_PM_H__ - -#include <asm/mips-boards/piix4.h> - -#ifdef CONFIG_MIPS_MALTA_PM - -/** - * mips_pm_suspend - enter a suspend state - * @state: the state to enter, one of PIIX4_FUNC3IO_PMCNTRL_SUS_TYP_* - * - * Enters a suspend state via the Malta's PIIX4. If the state to be entered - * is one which loses context (eg. SOFF) then this function will never - * return. - */ -extern int mips_pm_suspend(unsigned state); - -#else /* !CONFIG_MIPS_MALTA_PM */ - -static inline int mips_pm_suspend(unsigned state) -{ - return -EINVAL; -} - -#endif /* !CONFIG_MIPS_MALTA_PM */ - -#endif /* __ASM_MIPS_MACH_MALTA_PM_H__ */ diff --git a/arch/mips/include/asm/mach-malta/war.h b/arch/mips/include/asm/mach-malta/war.h deleted file mode 100644 index d62d2ffe515e..000000000000 --- a/arch/mips/include/asm/mach-malta/war.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_MIPS_WAR_H -#define __ASM_MIPS_MACH_MIPS_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 1 -#define MIPS_CACHE_SYNC_WAR 1 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 1 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_MIPS_WAR_H */ diff --git a/arch/mips/include/asm/mach-paravirt/cpu-feature-overrides.h b/arch/mips/include/asm/mach-paravirt/cpu-feature-overrides.h deleted file mode 100644 index 23ecf816daa7..000000000000 --- a/arch/mips/include/asm/mach-paravirt/cpu-feature-overrides.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2013 Cavium, Inc. - */ -#ifndef __ASM_MACH_PARAVIRT_CPU_FEATURE_OVERRIDES_H -#define __ASM_MACH_PARAVIRT_CPU_FEATURE_OVERRIDES_H - -#define cpu_has_4kex 1 -#define cpu_has_3k_cache 0 -#define cpu_has_tx39_cache 0 -#define cpu_has_counter 1 -#define cpu_has_llsc 1 -/* - * We Disable LL/SC on non SMP systems as it is faster to disable - * interrupts for atomic access than a LL/SC. - */ -#ifdef CONFIG_SMP -# define kernel_uses_llsc 1 -#else -# define kernel_uses_llsc 0 -#endif - -#ifdef CONFIG_CPU_CAVIUM_OCTEON -#define cpu_dcache_line_size() 128 -#define cpu_icache_line_size() 128 -#define cpu_has_octeon_cache 1 -#define cpu_has_4k_cache 0 -#else -#define cpu_has_4k_cache 1 -#endif - -#endif /* __ASM_MACH_PARAVIRT_CPU_FEATURE_OVERRIDES_H */ diff --git a/arch/mips/include/asm/mach-paravirt/irq.h b/arch/mips/include/asm/mach-paravirt/irq.h deleted file mode 100644 index 9b4d35eca977..000000000000 --- a/arch/mips/include/asm/mach-paravirt/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2013 Cavium, Inc. - */ -#ifndef __ASM_MACH_PARAVIRT_IRQ_H__ -#define __ASM_MACH_PARAVIRT_IRQ_H__ - -#define NR_IRQS 64 -#define MIPS_CPU_IRQ_BASE 1 - -#define MIPS_IRQ_PCIA (MIPS_CPU_IRQ_BASE + 8) - -#define MIPS_IRQ_MBOX0 (MIPS_CPU_IRQ_BASE + 32) -#define MIPS_IRQ_MBOX1 (MIPS_CPU_IRQ_BASE + 33) - -#endif /* __ASM_MACH_PARAVIRT_IRQ_H__ */ diff --git a/arch/mips/include/asm/mach-paravirt/kernel-entry-init.h b/arch/mips/include/asm/mach-paravirt/kernel-entry-init.h deleted file mode 100644 index c9f5769dfc8f..000000000000 --- a/arch/mips/include/asm/mach-paravirt/kernel-entry-init.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2013 Cavium, Inc - */ -#ifndef __ASM_MACH_PARAVIRT_KERNEL_ENTRY_H -#define __ASM_MACH_PARAVIRT_KERNEL_ENTRY_H - -#define CP0_EBASE $15, 1 - - .macro kernel_entry_setup -#ifdef CONFIG_SMP - mfc0 t0, CP0_EBASE - andi t0, t0, 0x3ff # CPUNum - beqz t0, 1f - # CPUs other than zero goto smp_bootstrap - j smp_bootstrap -#endif /* CONFIG_SMP */ - -1: - .endm - -/* - * Do SMP slave processor setup necessary before we can safely execute - * C code. - */ - .macro smp_slave_setup - mfc0 t0, CP0_EBASE - andi t0, t0, 0x3ff # CPUNum - slti t1, t0, NR_CPUS - bnez t1, 1f -2: - di - wait - b 2b # Unknown CPU, loop forever. -1: - PTR_LA t1, paravirt_smp_sp - PTR_SLL t0, PTR_SCALESHIFT - PTR_ADDU t1, t1, t0 -3: - PTR_L sp, 0(t1) - beqz sp, 3b # Spin until told to proceed. - - PTR_LA t1, paravirt_smp_gp - PTR_ADDU t1, t1, t0 - sync - PTR_L gp, 0(t1) - .endm - -#endif /* __ASM_MACH_PARAVIRT_KERNEL_ENTRY_H */ diff --git a/arch/mips/include/asm/mach-pnx833x/gpio.h b/arch/mips/include/asm/mach-pnx833x/gpio.h deleted file mode 100644 index 85b5b8e26118..000000000000 --- a/arch/mips/include/asm/mach-pnx833x/gpio.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * gpio.h: GPIO Support for PNX833X. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - */ -#ifndef __ASM_MIPS_MACH_PNX833X_GPIO_H -#define __ASM_MIPS_MACH_PNX833X_GPIO_H - -/* BIG FAT WARNING: races danger! - No protections exist here. Current users are only early init code, - when locking is not needed because no concurrency yet exists there, - and GPIO IRQ dispatcher, which does locking. - However, if many uses will ever happen, proper locking will be needed - - including locking between different uses -*/ - -#include <asm/mach-pnx833x/pnx833x.h> - -#define SET_REG_BIT(reg, bit) do { (reg |= (1 << (bit))); } while (0) -#define CLEAR_REG_BIT(reg, bit) do { (reg &= ~(1 << (bit))); } while (0) - -/* Initialize GPIO to a known state */ -static inline void pnx833x_gpio_init(void) -{ - PNX833X_PIO_DIR = 0; - PNX833X_PIO_DIR2 = 0; - PNX833X_PIO_SEL = 0; - PNX833X_PIO_SEL2 = 0; - PNX833X_PIO_INT_EDGE = 0; - PNX833X_PIO_INT_HI = 0; - PNX833X_PIO_INT_LO = 0; - - /* clear any GPIO interrupt requests */ - PNX833X_PIO_INT_CLEAR = 0xffff; - PNX833X_PIO_INT_CLEAR = 0; - PNX833X_PIO_INT_ENABLE = 0; -} - -/* Select GPIO direction for a pin */ -static inline void pnx833x_gpio_select_input(unsigned int pin) -{ - if (pin < 32) - CLEAR_REG_BIT(PNX833X_PIO_DIR, pin); - else - CLEAR_REG_BIT(PNX833X_PIO_DIR2, pin & 31); -} -static inline void pnx833x_gpio_select_output(unsigned int pin) -{ - if (pin < 32) - SET_REG_BIT(PNX833X_PIO_DIR, pin); - else - SET_REG_BIT(PNX833X_PIO_DIR2, pin & 31); -} - -/* Select GPIO or alternate function for a pin */ -static inline void pnx833x_gpio_select_function_io(unsigned int pin) -{ - if (pin < 32) - CLEAR_REG_BIT(PNX833X_PIO_SEL, pin); - else - CLEAR_REG_BIT(PNX833X_PIO_SEL2, pin & 31); -} -static inline void pnx833x_gpio_select_function_alt(unsigned int pin) -{ - if (pin < 32) - SET_REG_BIT(PNX833X_PIO_SEL, pin); - else - SET_REG_BIT(PNX833X_PIO_SEL2, pin & 31); -} - -/* Read GPIO pin */ -static inline int pnx833x_gpio_read(unsigned int pin) -{ - if (pin < 32) - return (PNX833X_PIO_IN >> pin) & 1; - else - return (PNX833X_PIO_IN2 >> (pin & 31)) & 1; -} - -/* Write GPIO pin */ -static inline void pnx833x_gpio_write(unsigned int val, unsigned int pin) -{ - if (pin < 32) { - if (val) - SET_REG_BIT(PNX833X_PIO_OUT, pin); - else - CLEAR_REG_BIT(PNX833X_PIO_OUT, pin); - } else { - if (val) - SET_REG_BIT(PNX833X_PIO_OUT2, pin & 31); - else - CLEAR_REG_BIT(PNX833X_PIO_OUT2, pin & 31); - } -} - -/* Configure GPIO interrupt */ -#define GPIO_INT_NONE 0 -#define GPIO_INT_LEVEL_LOW 1 -#define GPIO_INT_LEVEL_HIGH 2 -#define GPIO_INT_EDGE_RISING 3 -#define GPIO_INT_EDGE_FALLING 4 -#define GPIO_INT_EDGE_BOTH 5 -static inline void pnx833x_gpio_setup_irq(int when, unsigned int pin) -{ - switch (when) { - case GPIO_INT_LEVEL_LOW: - CLEAR_REG_BIT(PNX833X_PIO_INT_EDGE, pin); - CLEAR_REG_BIT(PNX833X_PIO_INT_HI, pin); - SET_REG_BIT(PNX833X_PIO_INT_LO, pin); - break; - case GPIO_INT_LEVEL_HIGH: - CLEAR_REG_BIT(PNX833X_PIO_INT_EDGE, pin); - SET_REG_BIT(PNX833X_PIO_INT_HI, pin); - CLEAR_REG_BIT(PNX833X_PIO_INT_LO, pin); - break; - case GPIO_INT_EDGE_RISING: - SET_REG_BIT(PNX833X_PIO_INT_EDGE, pin); - SET_REG_BIT(PNX833X_PIO_INT_HI, pin); - CLEAR_REG_BIT(PNX833X_PIO_INT_LO, pin); - break; - case GPIO_INT_EDGE_FALLING: - SET_REG_BIT(PNX833X_PIO_INT_EDGE, pin); - CLEAR_REG_BIT(PNX833X_PIO_INT_HI, pin); - SET_REG_BIT(PNX833X_PIO_INT_LO, pin); - break; - case GPIO_INT_EDGE_BOTH: - SET_REG_BIT(PNX833X_PIO_INT_EDGE, pin); - SET_REG_BIT(PNX833X_PIO_INT_HI, pin); - SET_REG_BIT(PNX833X_PIO_INT_LO, pin); - break; - default: - CLEAR_REG_BIT(PNX833X_PIO_INT_EDGE, pin); - CLEAR_REG_BIT(PNX833X_PIO_INT_HI, pin); - CLEAR_REG_BIT(PNX833X_PIO_INT_LO, pin); - break; - } -} - -/* Enable/disable GPIO interrupt */ -static inline void pnx833x_gpio_enable_irq(unsigned int pin) -{ - SET_REG_BIT(PNX833X_PIO_INT_ENABLE, pin); -} -static inline void pnx833x_gpio_disable_irq(unsigned int pin) -{ - CLEAR_REG_BIT(PNX833X_PIO_INT_ENABLE, pin); -} - -/* Clear GPIO interrupt request */ -static inline void pnx833x_gpio_clear_irq(unsigned int pin) -{ - SET_REG_BIT(PNX833X_PIO_INT_CLEAR, pin); - CLEAR_REG_BIT(PNX833X_PIO_INT_CLEAR, pin); -} - -#endif diff --git a/arch/mips/include/asm/mach-pnx833x/irq-mapping.h b/arch/mips/include/asm/mach-pnx833x/irq-mapping.h deleted file mode 100644 index 32d8063c1bbc..000000000000 --- a/arch/mips/include/asm/mach-pnx833x/irq-mapping.h +++ /dev/null @@ -1,112 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -/* - * irq.h: IRQ mappings for PNX833X. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - */ - -#ifndef __ASM_MIPS_MACH_PNX833X_IRQ_MAPPING_H -#define __ASM_MIPS_MACH_PNX833X_IRQ_MAPPING_H -/* - * The "IRQ numbers" are completely virtual. - * - * In PNX8330/1, we have 48 interrupt lines, numbered from 1 to 48. - * Let's use numbers 1..48 for PIC interrupts, number 0 for timer interrupt, - * numbers 49..64 for (virtual) GPIO interrupts. - * - * In PNX8335, we have 57 interrupt lines, numbered from 1 to 57, - * connected to PIC, which uses core hardware interrupt 2, and also - * a timer interrupt through hardware interrupt 5. - * Let's use numbers 1..64 for PIC interrupts, number 0 for timer interrupt, - * numbers 65..80 for (virtual) GPIO interrupts. - * - */ -#include <irq.h> - -#define PNX833X_TIMER_IRQ (MIPS_CPU_IRQ_BASE + 7) - -/* Interrupts supported by PIC */ -#define PNX833X_PIC_I2C0_INT (PNX833X_PIC_IRQ_BASE + 1) -#define PNX833X_PIC_I2C1_INT (PNX833X_PIC_IRQ_BASE + 2) -#define PNX833X_PIC_UART0_INT (PNX833X_PIC_IRQ_BASE + 3) -#define PNX833X_PIC_UART1_INT (PNX833X_PIC_IRQ_BASE + 4) -#define PNX833X_PIC_TS_IN0_DV_INT (PNX833X_PIC_IRQ_BASE + 5) -#define PNX833X_PIC_TS_IN0_DMA_INT (PNX833X_PIC_IRQ_BASE + 6) -#define PNX833X_PIC_GPIO_INT (PNX833X_PIC_IRQ_BASE + 7) -#define PNX833X_PIC_AUDIO_DEC_INT (PNX833X_PIC_IRQ_BASE + 8) -#define PNX833X_PIC_VIDEO_DEC_INT (PNX833X_PIC_IRQ_BASE + 9) -#define PNX833X_PIC_CONFIG_INT (PNX833X_PIC_IRQ_BASE + 10) -#define PNX833X_PIC_AOI_INT (PNX833X_PIC_IRQ_BASE + 11) -#define PNX833X_PIC_SYNC_INT (PNX833X_PIC_IRQ_BASE + 12) -#define PNX8330_PIC_SPU_INT (PNX833X_PIC_IRQ_BASE + 13) -#define PNX8335_PIC_SATA_INT (PNX833X_PIC_IRQ_BASE + 13) -#define PNX833X_PIC_OSD_INT (PNX833X_PIC_IRQ_BASE + 14) -#define PNX833X_PIC_DISP1_INT (PNX833X_PIC_IRQ_BASE + 15) -#define PNX833X_PIC_DEINTERLACER_INT (PNX833X_PIC_IRQ_BASE + 16) -#define PNX833X_PIC_DISPLAY2_INT (PNX833X_PIC_IRQ_BASE + 17) -#define PNX833X_PIC_VC_INT (PNX833X_PIC_IRQ_BASE + 18) -#define PNX833X_PIC_SC_INT (PNX833X_PIC_IRQ_BASE + 19) -#define PNX833X_PIC_IDE_INT (PNX833X_PIC_IRQ_BASE + 20) -#define PNX833X_PIC_IDE_DMA_INT (PNX833X_PIC_IRQ_BASE + 21) -#define PNX833X_PIC_TS_IN1_DV_INT (PNX833X_PIC_IRQ_BASE + 22) -#define PNX833X_PIC_TS_IN1_DMA_INT (PNX833X_PIC_IRQ_BASE + 23) -#define PNX833X_PIC_SGDX_DMA_INT (PNX833X_PIC_IRQ_BASE + 24) -#define PNX833X_PIC_TS_OUT_INT (PNX833X_PIC_IRQ_BASE + 25) -#define PNX833X_PIC_IR_INT (PNX833X_PIC_IRQ_BASE + 26) -#define PNX833X_PIC_VMSP1_INT (PNX833X_PIC_IRQ_BASE + 27) -#define PNX833X_PIC_VMSP2_INT (PNX833X_PIC_IRQ_BASE + 28) -#define PNX833X_PIC_PIBC_INT (PNX833X_PIC_IRQ_BASE + 29) -#define PNX833X_PIC_TS_IN0_TRD_INT (PNX833X_PIC_IRQ_BASE + 30) -#define PNX833X_PIC_SGDX_TPD_INT (PNX833X_PIC_IRQ_BASE + 31) -#define PNX833X_PIC_USB_INT (PNX833X_PIC_IRQ_BASE + 32) -#define PNX833X_PIC_TS_IN1_TRD_INT (PNX833X_PIC_IRQ_BASE + 33) -#define PNX833X_PIC_CLOCK_INT (PNX833X_PIC_IRQ_BASE + 34) -#define PNX833X_PIC_SGDX_PARSER_INT (PNX833X_PIC_IRQ_BASE + 35) -#define PNX833X_PIC_VMSP_DMA_INT (PNX833X_PIC_IRQ_BASE + 36) - -#if defined(CONFIG_SOC_PNX8335) -#define PNX8335_PIC_MIU_INT (PNX833X_PIC_IRQ_BASE + 37) -#define PNX8335_PIC_AVCHIP_IRQ_INT (PNX833X_PIC_IRQ_BASE + 38) -#define PNX8335_PIC_SYNC_HD_INT (PNX833X_PIC_IRQ_BASE + 39) -#define PNX8335_PIC_DISP_HD_INT (PNX833X_PIC_IRQ_BASE + 40) -#define PNX8335_PIC_DISP_SCALER_INT (PNX833X_PIC_IRQ_BASE + 41) -#define PNX8335_PIC_OSD_HD1_INT (PNX833X_PIC_IRQ_BASE + 42) -#define PNX8335_PIC_DTL_WRITER_Y_INT (PNX833X_PIC_IRQ_BASE + 43) -#define PNX8335_PIC_DTL_WRITER_C_INT (PNX833X_PIC_IRQ_BASE + 44) -#define PNX8335_PIC_DTL_EMULATOR_Y_IR_INT (PNX833X_PIC_IRQ_BASE + 45) -#define PNX8335_PIC_DTL_EMULATOR_C_IR_INT (PNX833X_PIC_IRQ_BASE + 46) -#define PNX8335_PIC_DENC_TTX_INT (PNX833X_PIC_IRQ_BASE + 47) -#define PNX8335_PIC_MMI_SIF0_INT (PNX833X_PIC_IRQ_BASE + 48) -#define PNX8335_PIC_MMI_SIF1_INT (PNX833X_PIC_IRQ_BASE + 49) -#define PNX8335_PIC_MMI_CDMMU_INT (PNX833X_PIC_IRQ_BASE + 50) -#define PNX8335_PIC_PIBCS_INT (PNX833X_PIC_IRQ_BASE + 51) -#define PNX8335_PIC_ETHERNET_INT (PNX833X_PIC_IRQ_BASE + 52) -#define PNX8335_PIC_VMSP1_0_INT (PNX833X_PIC_IRQ_BASE + 53) -#define PNX8335_PIC_VMSP1_1_INT (PNX833X_PIC_IRQ_BASE + 54) -#define PNX8335_PIC_VMSP1_DMA_INT (PNX833X_PIC_IRQ_BASE + 55) -#define PNX8335_PIC_TDGR_DE_INT (PNX833X_PIC_IRQ_BASE + 56) -#define PNX8335_PIC_IR1_IRQ_INT (PNX833X_PIC_IRQ_BASE + 57) -#endif - -/* GPIO interrupts */ -#define PNX833X_GPIO_0_INT (PNX833X_GPIO_IRQ_BASE + 0) -#define PNX833X_GPIO_1_INT (PNX833X_GPIO_IRQ_BASE + 1) -#define PNX833X_GPIO_2_INT (PNX833X_GPIO_IRQ_BASE + 2) -#define PNX833X_GPIO_3_INT (PNX833X_GPIO_IRQ_BASE + 3) -#define PNX833X_GPIO_4_INT (PNX833X_GPIO_IRQ_BASE + 4) -#define PNX833X_GPIO_5_INT (PNX833X_GPIO_IRQ_BASE + 5) -#define PNX833X_GPIO_6_INT (PNX833X_GPIO_IRQ_BASE + 6) -#define PNX833X_GPIO_7_INT (PNX833X_GPIO_IRQ_BASE + 7) -#define PNX833X_GPIO_8_INT (PNX833X_GPIO_IRQ_BASE + 8) -#define PNX833X_GPIO_9_INT (PNX833X_GPIO_IRQ_BASE + 9) -#define PNX833X_GPIO_10_INT (PNX833X_GPIO_IRQ_BASE + 10) -#define PNX833X_GPIO_11_INT (PNX833X_GPIO_IRQ_BASE + 11) -#define PNX833X_GPIO_12_INT (PNX833X_GPIO_IRQ_BASE + 12) -#define PNX833X_GPIO_13_INT (PNX833X_GPIO_IRQ_BASE + 13) -#define PNX833X_GPIO_14_INT (PNX833X_GPIO_IRQ_BASE + 14) -#define PNX833X_GPIO_15_INT (PNX833X_GPIO_IRQ_BASE + 15) - -#endif diff --git a/arch/mips/include/asm/mach-pnx833x/irq.h b/arch/mips/include/asm/mach-pnx833x/irq.h deleted file mode 100644 index b7a6dab5b9f7..000000000000 --- a/arch/mips/include/asm/mach-pnx833x/irq.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * irq.h: IRQ mappings for PNX833X. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - */ - -#ifndef __ASM_MIPS_MACH_PNX833X_IRQ_H -#define __ASM_MIPS_MACH_PNX833X_IRQ_H -/* - * The "IRQ numbers" are completely virtual. - * - * In PNX8330/1, we have 48 interrupt lines, numbered from 1 to 48. - * Let's use numbers 1..48 for PIC interrupts, number 0 for timer interrupt, - * numbers 49..64 for (virtual) GPIO interrupts. - * - * In PNX8335, we have 57 interrupt lines, numbered from 1 to 57, - * connected to PIC, which uses core hardware interrupt 2, and also - * a timer interrupt through hardware interrupt 5. - * Let's use numbers 1..64 for PIC interrupts, number 0 for timer interrupt, - * numbers 65..80 for (virtual) GPIO interrupts. - * - */ -#if defined(CONFIG_SOC_PNX8335) - #define PNX833X_PIC_NUM_IRQ 58 -#else - #define PNX833X_PIC_NUM_IRQ 37 -#endif - -#define MIPS_CPU_NUM_IRQ 8 -#define PNX833X_GPIO_NUM_IRQ 16 - -#define MIPS_CPU_IRQ_BASE 0 -#define PNX833X_PIC_IRQ_BASE (MIPS_CPU_IRQ_BASE + MIPS_CPU_NUM_IRQ) -#define PNX833X_GPIO_IRQ_BASE (PNX833X_PIC_IRQ_BASE + PNX833X_PIC_NUM_IRQ) -#define NR_IRQS (MIPS_CPU_NUM_IRQ + PNX833X_PIC_NUM_IRQ + PNX833X_GPIO_NUM_IRQ) - -#endif diff --git a/arch/mips/include/asm/mach-pnx833x/pnx833x.h b/arch/mips/include/asm/mach-pnx833x/pnx833x.h deleted file mode 100644 index 00bb67a36386..000000000000 --- a/arch/mips/include/asm/mach-pnx833x/pnx833x.h +++ /dev/null @@ -1,189 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * pnx833x.h: Register mappings for PNX833X. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - */ -#ifndef __ASM_MIPS_MACH_PNX833X_PNX833X_H -#define __ASM_MIPS_MACH_PNX833X_PNX833X_H - -/* All regs are accessed in KSEG1 */ -#define PNX833X_BASE (0xa0000000ul + 0x17E00000ul) - -#define PNX833X_REG(offs) (*((volatile unsigned long *)(PNX833X_BASE + offs))) - -/* Registers are named exactly as in PNX833X docs, just with PNX833X_ prefix */ - -/* Read access to multibit fields */ -#define PNX833X_BIT(val, reg, field) ((val) & PNX833X_##reg##_##field) -#define PNX833X_REGBIT(reg, field) PNX833X_BIT(PNX833X_##reg, reg, field) - -/* Use PNX833X_FIELD to extract a field from val */ -#define PNX_FIELD(cpu, val, reg, field) \ - (((val) & PNX##cpu##_##reg##_##field##_MASK) >> \ - PNX##cpu##_##reg##_##field##_SHIFT) -#define PNX833X_FIELD(val, reg, field) PNX_FIELD(833X, val, reg, field) -#define PNX8330_FIELD(val, reg, field) PNX_FIELD(8330, val, reg, field) -#define PNX8335_FIELD(val, reg, field) PNX_FIELD(8335, val, reg, field) - -/* Use PNX833X_REGFIELD to extract a field from a register */ -#define PNX833X_REGFIELD(reg, field) PNX833X_FIELD(PNX833X_##reg, reg, field) -#define PNX8330_REGFIELD(reg, field) PNX8330_FIELD(PNX8330_##reg, reg, field) -#define PNX8335_REGFIELD(reg, field) PNX8335_FIELD(PNX8335_##reg, reg, field) - - -#define PNX_WRITEFIELD(cpu, val, reg, field) \ - (PNX##cpu##_##reg = (PNX##cpu##_##reg & ~(PNX##cpu##_##reg##_##field##_MASK)) | \ - ((val) << PNX##cpu##_##reg##_##field##_SHIFT)) -#define PNX833X_WRITEFIELD(val, reg, field) \ - PNX_WRITEFIELD(833X, val, reg, field) -#define PNX8330_WRITEFIELD(val, reg, field) \ - PNX_WRITEFIELD(8330, val, reg, field) -#define PNX8335_WRITEFIELD(val, reg, field) \ - PNX_WRITEFIELD(8335, val, reg, field) - - -/* Macros to detect CPU type */ - -#define PNX833X_CONFIG_MODULE_ID PNX833X_REG(0x7FFC) -#define PNX833X_CONFIG_MODULE_ID_MAJREV_MASK 0x0000f000 -#define PNX833X_CONFIG_MODULE_ID_MAJREV_SHIFT 12 -#define PNX8330_CONFIG_MODULE_MAJREV 4 -#define PNX8335_CONFIG_MODULE_MAJREV 5 -#define CPU_IS_PNX8330 (PNX833X_REGFIELD(CONFIG_MODULE_ID, MAJREV) == \ - PNX8330_CONFIG_MODULE_MAJREV) -#define CPU_IS_PNX8335 (PNX833X_REGFIELD(CONFIG_MODULE_ID, MAJREV) == \ - PNX8335_CONFIG_MODULE_MAJREV) - - - -#define PNX833X_RESET_CONTROL PNX833X_REG(0x8004) -#define PNX833X_RESET_CONTROL_2 PNX833X_REG(0x8014) - -#define PNX833X_PIC_REG(offs) PNX833X_REG(0x01000 + (offs)) -#define PNX833X_PIC_INT_PRIORITY PNX833X_PIC_REG(0x0) -#define PNX833X_PIC_INT_SRC PNX833X_PIC_REG(0x4) -#define PNX833X_PIC_INT_SRC_INT_SRC_MASK 0x00000FF8ul /* bits 11:3 */ -#define PNX833X_PIC_INT_SRC_INT_SRC_SHIFT 3 -#define PNX833X_PIC_INT_REG(irq) PNX833X_PIC_REG(0x10 + 4*(irq)) - -#define PNX833X_CLOCK_CPUCP_CTL PNX833X_REG(0x9228) -#define PNX833X_CLOCK_CPUCP_CTL_EXIT_RESET 0x00000002ul /* bit 1 */ -#define PNX833X_CLOCK_CPUCP_CTL_DIV_CLOCK_MASK 0x00000018ul /* bits 4:3 */ -#define PNX833X_CLOCK_CPUCP_CTL_DIV_CLOCK_SHIFT 3 - -#define PNX8335_CLOCK_PLL_CPU_CTL PNX833X_REG(0x9020) -#define PNX8335_CLOCK_PLL_CPU_CTL_FREQ_MASK 0x1f -#define PNX8335_CLOCK_PLL_CPU_CTL_FREQ_SHIFT 0 - -#define PNX833X_CONFIG_MUX PNX833X_REG(0x7004) -#define PNX833X_CONFIG_MUX_IDE_MUX 0x00000080 /* bit 7 */ - -#define PNX8330_CONFIG_POLYFUSE_7 PNX833X_REG(0x7040) -#define PNX8330_CONFIG_POLYFUSE_7_BOOT_MODE_MASK 0x00180000 -#define PNX8330_CONFIG_POLYFUSE_7_BOOT_MODE_SHIFT 19 - -#define PNX833X_PIO_IN PNX833X_REG(0xF000) -#define PNX833X_PIO_OUT PNX833X_REG(0xF004) -#define PNX833X_PIO_DIR PNX833X_REG(0xF008) -#define PNX833X_PIO_SEL PNX833X_REG(0xF014) -#define PNX833X_PIO_INT_EDGE PNX833X_REG(0xF020) -#define PNX833X_PIO_INT_HI PNX833X_REG(0xF024) -#define PNX833X_PIO_INT_LO PNX833X_REG(0xF028) -#define PNX833X_PIO_INT_STATUS PNX833X_REG(0xFFE0) -#define PNX833X_PIO_INT_ENABLE PNX833X_REG(0xFFE4) -#define PNX833X_PIO_INT_CLEAR PNX833X_REG(0xFFE8) -#define PNX833X_PIO_IN2 PNX833X_REG(0xF05C) -#define PNX833X_PIO_OUT2 PNX833X_REG(0xF060) -#define PNX833X_PIO_DIR2 PNX833X_REG(0xF064) -#define PNX833X_PIO_SEL2 PNX833X_REG(0xF068) - -#define PNX833X_UART0_PORTS_START (PNX833X_BASE + 0xB000) -#define PNX833X_UART0_PORTS_END (PNX833X_BASE + 0xBFFF) -#define PNX833X_UART1_PORTS_START (PNX833X_BASE + 0xC000) -#define PNX833X_UART1_PORTS_END (PNX833X_BASE + 0xCFFF) - -#define PNX833X_USB_PORTS_START (PNX833X_BASE + 0x19000) -#define PNX833X_USB_PORTS_END (PNX833X_BASE + 0x19FFF) - -#define PNX833X_CONFIG_USB PNX833X_REG(0x7008) - -#define PNX833X_I2C0_PORTS_START (PNX833X_BASE + 0xD000) -#define PNX833X_I2C0_PORTS_END (PNX833X_BASE + 0xDFFF) -#define PNX833X_I2C1_PORTS_START (PNX833X_BASE + 0xE000) -#define PNX833X_I2C1_PORTS_END (PNX833X_BASE + 0xEFFF) - -#define PNX833X_IDE_PORTS_START (PNX833X_BASE + 0x1A000) -#define PNX833X_IDE_PORTS_END (PNX833X_BASE + 0x1AFFF) -#define PNX833X_IDE_MODULE_ID PNX833X_REG(0x1AFFC) - -#define PNX833X_IDE_MODULE_ID_MODULE_ID_MASK 0xFFFF0000 -#define PNX833X_IDE_MODULE_ID_MODULE_ID_SHIFT 16 -#define PNX833X_IDE_MODULE_ID_VALUE 0xA009 - - -#define PNX833X_MIU_SEL0 PNX833X_REG(0x2004) -#define PNX833X_MIU_SEL0_TIMING PNX833X_REG(0x2008) -#define PNX833X_MIU_SEL1 PNX833X_REG(0x200C) -#define PNX833X_MIU_SEL1_TIMING PNX833X_REG(0x2010) -#define PNX833X_MIU_SEL2 PNX833X_REG(0x2014) -#define PNX833X_MIU_SEL2_TIMING PNX833X_REG(0x2018) -#define PNX833X_MIU_SEL3 PNX833X_REG(0x201C) -#define PNX833X_MIU_SEL3_TIMING PNX833X_REG(0x2020) - -#define PNX833X_MIU_SEL0_SPI_MODE_ENABLE_MASK (1 << 14) -#define PNX833X_MIU_SEL0_SPI_MODE_ENABLE_SHIFT 14 - -#define PNX833X_MIU_SEL0_BURST_MODE_ENABLE_MASK (1 << 7) -#define PNX833X_MIU_SEL0_BURST_MODE_ENABLE_SHIFT 7 - -#define PNX833X_MIU_SEL0_BURST_PAGE_LEN_MASK (0xF << 9) -#define PNX833X_MIU_SEL0_BURST_PAGE_LEN_SHIFT 9 - -#define PNX833X_MIU_CONFIG_SPI PNX833X_REG(0x2000) - -#define PNX833X_MIU_CONFIG_SPI_OPCODE_MASK (0xFF << 3) -#define PNX833X_MIU_CONFIG_SPI_OPCODE_SHIFT 3 - -#define PNX833X_MIU_CONFIG_SPI_DATA_ENABLE_MASK (1 << 2) -#define PNX833X_MIU_CONFIG_SPI_DATA_ENABLE_SHIFT 2 - -#define PNX833X_MIU_CONFIG_SPI_ADDR_ENABLE_MASK (1 << 1) -#define PNX833X_MIU_CONFIG_SPI_ADDR_ENABLE_SHIFT 1 - -#define PNX833X_MIU_CONFIG_SPI_SYNC_MASK (1 << 0) -#define PNX833X_MIU_CONFIG_SPI_SYNC_SHIFT 0 - -#define PNX833X_WRITE_CONFIG_SPI(opcode, data_enable, addr_enable, sync) \ - (PNX833X_MIU_CONFIG_SPI = \ - ((opcode) << PNX833X_MIU_CONFIG_SPI_OPCODE_SHIFT) | \ - ((data_enable) << PNX833X_MIU_CONFIG_SPI_DATA_ENABLE_SHIFT) | \ - ((addr_enable) << PNX833X_MIU_CONFIG_SPI_ADDR_ENABLE_SHIFT) | \ - ((sync) << PNX833X_MIU_CONFIG_SPI_SYNC_SHIFT)) - -#define PNX8335_IP3902_PORTS_START (PNX833X_BASE + 0x2F000) -#define PNX8335_IP3902_PORTS_END (PNX833X_BASE + 0x2FFFF) -#define PNX8335_IP3902_MODULE_ID PNX833X_REG(0x2FFFC) - -#define PNX8335_IP3902_MODULE_ID_MODULE_ID_MASK 0xFFFF0000 -#define PNX8335_IP3902_MODULE_ID_MODULE_ID_SHIFT 16 -#define PNX8335_IP3902_MODULE_ID_VALUE 0x3902 - - /* I/O location(gets remapped)*/ -#define PNX8335_NAND_BASE 0x18000000 -/* I/O location with CLE high */ -#define PNX8335_NAND_CLE_MASK 0x00100000 -/* I/O location with ALE high */ -#define PNX8335_NAND_ALE_MASK 0x00010000 - -#define PNX8335_SATA_PORTS_START (PNX833X_BASE + 0x2E000) -#define PNX8335_SATA_PORTS_END (PNX833X_BASE + 0x2EFFF) -#define PNX8335_SATA_MODULE_ID PNX833X_REG(0x2EFFC) - -#define PNX8335_SATA_MODULE_ID_MODULE_ID_MASK 0xFFFF0000 -#define PNX8335_SATA_MODULE_ID_MODULE_ID_SHIFT 16 -#define PNX8335_SATA_MODULE_ID_VALUE 0xA099 - -#endif diff --git a/arch/mips/include/asm/mach-rc32434/war.h b/arch/mips/include/asm/mach-rc32434/war.h deleted file mode 100644 index af430d26f713..000000000000 --- a/arch/mips/include/asm/mach-rc32434/war.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_MIPS_WAR_H -#define __ASM_MIPS_MACH_MIPS_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 1 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_MIPS_WAR_H */ diff --git a/arch/mips/include/asm/mach-rm/war.h b/arch/mips/include/asm/mach-rm/war.h deleted file mode 100644 index eca16d167c2f..000000000000 --- a/arch/mips/include/asm/mach-rm/war.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_RM_WAR_H -#define __ASM_MIPS_MACH_RM_WAR_H - -/* - * The RM200C seems to have been shipped only with V2.0 R4600s - */ - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 1 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_RM_WAR_H */ diff --git a/arch/mips/include/asm/mach-sibyte/war.h b/arch/mips/include/asm/mach-sibyte/war.h deleted file mode 100644 index 4755b6116807..000000000000 --- a/arch/mips/include/asm/mach-sibyte/war.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_SIBYTE_WAR_H -#define __ASM_MIPS_MACH_SIBYTE_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 - -#if defined(CONFIG_SB1_PASS_2_WORKAROUNDS) - -#ifndef __ASSEMBLY__ -extern int sb1250_m3_workaround_needed(void); -#endif - -#define BCM1250_M3_WAR sb1250_m3_workaround_needed() -#define SIBYTE_1956_WAR 1 - -#else - -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 - -#endif - -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 0 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_SIBYTE_WAR_H */ diff --git a/arch/mips/include/asm/mach-tx49xx/war.h b/arch/mips/include/asm/mach-tx49xx/war.h deleted file mode 100644 index 445abb4eb769..000000000000 --- a/arch/mips/include/asm/mach-tx49xx/war.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2002, 2004, 2007 by Ralf Baechle <ralf@linux-mips.org> - */ -#ifndef __ASM_MIPS_MACH_TX49XX_WAR_H -#define __ASM_MIPS_MACH_TX49XX_WAR_H - -#define R4600_V1_INDEX_ICACHEOP_WAR 0 -#define R4600_V1_HIT_CACHEOP_WAR 0 -#define R4600_V2_HIT_CACHEOP_WAR 0 -#define BCM1250_M3_WAR 0 -#define SIBYTE_1956_WAR 0 -#define MIPS4K_ICACHE_REFILL_WAR 0 -#define MIPS_CACHE_SYNC_WAR 0 -#define TX49XX_ICACHE_INDEX_INV_WAR 1 -#define ICACHE_REFILLS_WORKAROUND_WAR 0 -#define R10000_LLSC_WAR 0 -#define MIPS34K_MISSED_ITLB_WAR 0 - -#endif /* __ASM_MIPS_MACH_TX49XX_WAR_H */ diff --git a/arch/mips/include/asm/mips-boards/malta.h b/arch/mips/include/asm/mips-boards/malta.h index 65de4fb06096..254be3d62519 100644 --- a/arch/mips/include/asm/mips-boards/malta.h +++ b/arch/mips/include/asm/mips-boards/malta.h @@ -92,4 +92,6 @@ static inline unsigned long get_msc_port_base(unsigned long reg) #define MALTA_JMPRS_REG 0x1f000210 +extern void __init *malta_dt_shim(void *fdt); + #endif /* __ASM_MIPS_BOARDS_MALTA_H */ diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index 4ddc12e4444a..a0e8ae5497b6 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -389,6 +389,13 @@ #define ST0_CU3 0x80000000 #define ST0_XX 0x80000000 /* MIPS IV naming */ +/* in-kernel enabled CUs */ +#ifdef CONFIG_CPU_LOONGSON64 +#define ST0_KERNEL_CUMASK (ST0_CU0 | ST0_CU2) +#else +#define ST0_KERNEL_CUMASK ST0_CU0 +#endif + /* * Bitfields and bit numbers in the coprocessor 0 IntCtl register. (MIPSR2) */ @@ -1706,12 +1713,6 @@ do { \ #define read_c0_count() __read_32bit_c0_register($9, 0) #define write_c0_count(val) __write_32bit_c0_register($9, 0, val) -#define read_c0_count2() __read_32bit_c0_register($9, 6) /* pnx8550 */ -#define write_c0_count2(val) __write_32bit_c0_register($9, 6, val) - -#define read_c0_count3() __read_32bit_c0_register($9, 7) /* pnx8550 */ -#define write_c0_count3(val) __write_32bit_c0_register($9, 7, val) - #define read_c0_entryhi() __read_ulong_c0_register($10, 0) #define write_c0_entryhi(val) __write_ulong_c0_register($10, 0, val) @@ -1730,12 +1731,6 @@ do { \ #define read_c0_guestctl0ext() __read_32bit_c0_register($11, 4) #define write_c0_guestctl0ext(val) __write_32bit_c0_register($11, 4, val) -#define read_c0_compare2() __read_32bit_c0_register($11, 6) /* pnx8550 */ -#define write_c0_compare2(val) __write_32bit_c0_register($11, 6, val) - -#define read_c0_compare3() __read_32bit_c0_register($11, 7) /* pnx8550 */ -#define write_c0_compare3(val) __write_32bit_c0_register($11, 7, val) - #define read_c0_status() __read_32bit_c0_register($12, 0) #define write_c0_status(val) __write_32bit_c0_register($12, 0, val) @@ -2728,7 +2723,7 @@ static inline void tlb_probe(void) static inline void tlb_read(void) { -#if MIPS34K_MISSED_ITLB_WAR +#ifdef CONFIG_WAR_MIPS34K_MISSED_ITLB int res = 0; __asm__ __volatile__( @@ -2750,7 +2745,7 @@ static inline void tlb_read(void) "tlbr\n\t" ".set reorder"); -#if MIPS34K_MISSED_ITLB_WAR +#ifdef CONFIG_WAR_MIPS34K_MISSED_ITLB if ((res & _ULCAST_(1))) __asm__ __volatile__( " .set push \n" diff --git a/arch/mips/include/asm/netlogic/psb-bootinfo.h b/arch/mips/include/asm/netlogic/psb-bootinfo.h index 6878307f0ee6..c716e9397113 100644 --- a/arch/mips/include/asm/netlogic/psb-bootinfo.h +++ b/arch/mips/include/asm/netlogic/psb-bootinfo.h @@ -77,21 +77,6 @@ struct psb_info { uint64_t avail_mem_map; }; -enum { - NETLOGIC_IO_SPACE = 0x10, - PCIX_IO_SPACE, - PCIX_CFG_SPACE, - PCIX_MEMORY_SPACE, - HT_IO_SPACE, - HT_CFG_SPACE, - HT_MEMORY_SPACE, - SRAM_SPACE, - FLASH_CONTROLLER_SPACE -}; - -#define NLM_MAX_ARGS 64 -#define NLM_MAX_ENVS 32 - /* This is what netlboot passes and linux boot_mem_map is subtly different */ #define NLM_BOOT_MEM_MAP_MAX 32 struct nlm_boot_mem_map { @@ -102,6 +87,7 @@ struct nlm_boot_mem_map { uint32_t type; /* type of memory segment */ } map[NLM_BOOT_MEM_MAP_MAX]; }; +#define NLM_BOOT_MEM_RAM 1 /* Pointer to saved boot loader info */ extern struct psb_info nlm_prom_info; diff --git a/arch/mips/include/asm/octeon/cvmx-bootinfo.h b/arch/mips/include/asm/octeon/cvmx-bootinfo.h index 62787765575e..c114a7ba0bad 100644 --- a/arch/mips/include/asm/octeon/cvmx-bootinfo.h +++ b/arch/mips/include/asm/octeon/cvmx-bootinfo.h @@ -295,6 +295,8 @@ enum cvmx_board_types_enum { */ CVMX_BOARD_TYPE_CUST_PRIVATE_MIN = 20001, CVMX_BOARD_TYPE_UBNT_E100 = 20002, + CVMX_BOARD_TYPE_UBNT_E200 = 20003, + CVMX_BOARD_TYPE_UBNT_E220 = 20005, CVMX_BOARD_TYPE_CUST_DSR1000N = 20006, CVMX_BOARD_TYPE_KONTRON_S1901 = 21901, CVMX_BOARD_TYPE_CUST_PRIVATE_MAX = 30000, @@ -396,6 +398,8 @@ static inline const char *cvmx_board_type_to_string(enum /* Customer private range */ ENUM_BRD_TYPE_CASE(CVMX_BOARD_TYPE_CUST_PRIVATE_MIN) ENUM_BRD_TYPE_CASE(CVMX_BOARD_TYPE_UBNT_E100) + ENUM_BRD_TYPE_CASE(CVMX_BOARD_TYPE_UBNT_E200) + ENUM_BRD_TYPE_CASE(CVMX_BOARD_TYPE_UBNT_E220) ENUM_BRD_TYPE_CASE(CVMX_BOARD_TYPE_CUST_DSR1000N) ENUM_BRD_TYPE_CASE(CVMX_BOARD_TYPE_KONTRON_S1901) ENUM_BRD_TYPE_CASE(CVMX_BOARD_TYPE_CUST_PRIVATE_MAX) diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h index e26dc41a8a68..2362842ee2b5 100644 --- a/arch/mips/include/asm/pgtable-bits.h +++ b/arch/mips/include/asm/pgtable-bits.h @@ -249,11 +249,6 @@ static inline uint64_t pte_to_entrylo(unsigned long pte_val) #define _CACHE_CACHABLE_NONCOHERENT (5<<_CACHE_SHIFT) -#elif defined(CONFIG_MACH_INGENIC) - -/* Ingenic uses the WA bit to achieve write-combine memory writes */ -#define _CACHE_UNCACHED_ACCELERATED (1<<_CACHE_SHIFT) - #endif #ifndef _CACHE_CACHABLE_NO_WA diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index dd7a0f552cac..e5ef0fdd4838 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -37,8 +37,6 @@ struct vm_area_struct; _PAGE_GLOBAL | _page_cachable_default) #define PAGE_KERNEL_NC __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ _PAGE_GLOBAL | _CACHE_CACHABLE_NONCOHERENT) -#define PAGE_USERIO __pgprot(_PAGE_PRESENT | _PAGE_WRITE | \ - _page_cachable_default) #define PAGE_KERNEL_UNCACHED __pgprot(_PAGE_PRESENT | __READABLE | \ __WRITEABLE | _PAGE_GLOBAL | _CACHE_UNCACHED) diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 856e12f6063d..7834e7c0c78a 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -29,6 +29,7 @@ */ extern unsigned int vced_count, vcei_count; +extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #ifdef CONFIG_32BIT #ifdef CONFIG_KVM_GUEST diff --git a/arch/mips/include/asm/r4k-timer.h b/arch/mips/include/asm/r4k-timer.h index afe9e0e03fe9..6e7361629348 100644 --- a/arch/mips/include/asm/r4k-timer.h +++ b/arch/mips/include/asm/r4k-timer.h @@ -5,8 +5,8 @@ * * Copyright (C) 2008 by Ralf Baechle (ralf@linux-mips.org) */ -#ifndef __ASM_R4K_TYPES_H -#define __ASM_R4K_TYPES_H +#ifndef __ASM_R4K_TIMER_H +#define __ASM_R4K_TIMER_H #include <linux/compiler.h> @@ -27,4 +27,4 @@ static inline void synchronise_count_slave(int cpu) #endif -#endif /* __ASM_R4K_TYPES_H */ +#endif /* __ASM_R4K_TIMER_H */ diff --git a/arch/mips/include/asm/sgi/heart.h b/arch/mips/include/asm/sgi/heart.h index c423221b4792..0d03751955c4 100644 --- a/arch/mips/include/asm/sgi/heart.h +++ b/arch/mips/include/asm/sgi/heart.h @@ -264,6 +264,57 @@ struct ip30_heart_regs { /* 0x0ff00000 */ #define HC_NCOR_MEM_ERR BIT(1) #define HC_COR_MEM_ERR BIT(0) +/* + * HEART has 64 interrupt vectors available to it, subdivided into five + * priority levels. They are numbered 0 to 63. + */ +#define HEART_NUM_IRQS 64 + +/* + * These are the five interrupt priority levels and their corresponding + * CPU IPx interrupt pins. + * + * Level 4 - Error Interrupts. + * Level 3 - HEART timer interrupt. + * Level 2 - CPU IPI, CPU debug, power putton, general device interrupts. + * Level 1 - General device interrupts. + * Level 0 - General device GFX flow control interrupts. + */ +#define HEART_L4_INT_MASK 0xfff8000000000000ULL /* IP6 */ +#define HEART_L3_INT_MASK 0x0004000000000000ULL /* IP5 */ +#define HEART_L2_INT_MASK 0x0003ffff00000000ULL /* IP4 */ +#define HEART_L1_INT_MASK 0x00000000ffff0000ULL /* IP3 */ +#define HEART_L0_INT_MASK 0x000000000000ffffULL /* IP2 */ + +/* HEART L0 Interrupts (Low Priority) */ +#define HEART_L0_INT_GENERIC 0 +#define HEART_L0_INT_FLOW_CTRL_HWTR_0 1 +#define HEART_L0_INT_FLOW_CTRL_HWTR_1 2 + +/* HEART L2 Interrupts (High Priority) */ +#define HEART_L2_INT_RESCHED_CPU_0 46 +#define HEART_L2_INT_RESCHED_CPU_1 47 +#define HEART_L2_INT_CALL_CPU_0 48 +#define HEART_L2_INT_CALL_CPU_1 49 + +/* HEART L3 Interrupts (Compare/Counter Timer) */ +#define HEART_L3_INT_TIMER 50 + +/* HEART L4 Interrupts (Errors) */ +#define HEART_L4_INT_XWID_ERR_9 51 +#define HEART_L4_INT_XWID_ERR_A 52 +#define HEART_L4_INT_XWID_ERR_B 53 +#define HEART_L4_INT_XWID_ERR_C 54 +#define HEART_L4_INT_XWID_ERR_D 55 +#define HEART_L4_INT_XWID_ERR_E 56 +#define HEART_L4_INT_XWID_ERR_F 57 +#define HEART_L4_INT_XWID_ERR_XBOW 58 +#define HEART_L4_INT_CPU_BUS_ERR_0 59 +#define HEART_L4_INT_CPU_BUS_ERR_1 60 +#define HEART_L4_INT_CPU_BUS_ERR_2 61 +#define HEART_L4_INT_CPU_BUS_ERR_3 62 +#define HEART_L4_INT_HEART_EXCP 63 + extern struct ip30_heart_regs __iomem *heart_regs; #define heart_read ____raw_readq diff --git a/arch/mips/include/asm/stackframe.h b/arch/mips/include/asm/stackframe.h index 3e8d2aaf96af..aa430a6c68b2 100644 --- a/arch/mips/include/asm/stackframe.h +++ b/arch/mips/include/asm/stackframe.h @@ -450,7 +450,7 @@ */ .macro CLI mfc0 t0, CP0_STATUS - li t1, ST0_CU0 | STATMASK + li t1, ST0_KERNEL_CUMASK | STATMASK or t0, t1 xori t0, STATMASK mtc0 t0, CP0_STATUS @@ -463,7 +463,7 @@ */ .macro STI mfc0 t0, CP0_STATUS - li t1, ST0_CU0 | STATMASK + li t1, ST0_KERNEL_CUMASK | STATMASK or t0, t1 xori t0, STATMASK & ~1 mtc0 t0, CP0_STATUS @@ -477,7 +477,7 @@ */ .macro KMODE mfc0 t0, CP0_STATUS - li t1, ST0_CU0 | (STATMASK & ~1) + li t1, ST0_KERNEL_CUMASK | (STATMASK & ~1) #if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX) andi t2, t0, ST0_IEP srl t2, 2 diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h index 0b0a93bf83cd..a4374b4cb88f 100644 --- a/arch/mips/include/asm/switch_to.h +++ b/arch/mips/include/asm/switch_to.h @@ -117,6 +117,8 @@ do { \ __restore_dsp(next); \ } \ if (cop2_present) { \ + u32 status = read_c0_status(); \ + \ set_c0_status(ST0_CU2); \ if ((KSTK_STATUS(prev) & ST0_CU2)) { \ if (cop2_lazy_restore) \ @@ -127,7 +129,7 @@ do { \ !cop2_lazy_restore) { \ cop2_restore(next); \ } \ - clear_c0_status(ST0_CU2); \ + write_c0_status(status); \ } \ __clear_r5_hw_ll_bit(); \ __clear_software_ll_bit(); \ diff --git a/arch/mips/include/asm/txx9/tx4939.h b/arch/mips/include/asm/txx9/tx4939.h index 00805ac6e9fc..abf980af9ef4 100644 --- a/arch/mips/include/asm/txx9/tx4939.h +++ b/arch/mips/include/asm/txx9/tx4939.h @@ -498,7 +498,6 @@ struct tx4939_vpc_desc { ((((mst) + 245/2) / 245UL * 429 * 16 + 19) / 19 / 2) void tx4939_wdt_init(void); -void tx4939_add_memory_regions(void); void tx4939_setup(void); void tx4939_time_init(unsigned int tmrnr); void tx4939_sio_init(unsigned int sclk, unsigned int cts_mask); diff --git a/arch/mips/include/asm/war.h b/arch/mips/include/asm/war.h index e43f800e662d..21443f096238 100644 --- a/arch/mips/include/asm/war.h +++ b/arch/mips/include/asm/war.h @@ -9,8 +9,6 @@ #ifndef _ASM_WAR_H #define _ASM_WAR_H -#include <war.h> - /* * Work around certain R4000 CPU errata (as implemented by GCC): * @@ -72,152 +70,4 @@ #define DADDI_WAR 0 #endif -/* - * Another R4600 erratum. Due to the lack of errata information the exact - * technical details aren't known. I've experimentally found that disabling - * interrupts during indexed I-cache flushes seems to be sufficient to deal - * with the issue. - */ -#ifndef R4600_V1_INDEX_ICACHEOP_WAR -#error Check setting of R4600_V1_INDEX_ICACHEOP_WAR for your platform -#endif - -/* - * Pleasures of the R4600 V1.x. Cite from the IDT R4600 V1.7 errata: - * - * 18. The CACHE instructions Hit_Writeback_Invalidate_D, Hit_Writeback_D, - * Hit_Invalidate_D and Create_Dirty_Excl_D should only be - * executed if there is no other dcache activity. If the dcache is - * accessed for another instruction immeidately preceding when these - * cache instructions are executing, it is possible that the dcache - * tag match outputs used by these cache instructions will be - * incorrect. These cache instructions should be preceded by at least - * four instructions that are not any kind of load or store - * instruction. - * - * This is not allowed: lw - * nop - * nop - * nop - * cache Hit_Writeback_Invalidate_D - * - * This is allowed: lw - * nop - * nop - * nop - * nop - * cache Hit_Writeback_Invalidate_D - */ -#ifndef R4600_V1_HIT_CACHEOP_WAR -#error Check setting of R4600_V1_HIT_CACHEOP_WAR for your platform -#endif - - -/* - * Writeback and invalidate the primary cache dcache before DMA. - * - * R4600 v2.0 bug: "The CACHE instructions Hit_Writeback_Inv_D, - * Hit_Writeback_D, Hit_Invalidate_D and Create_Dirty_Exclusive_D will only - * operate correctly if the internal data cache refill buffer is empty. These - * CACHE instructions should be separated from any potential data cache miss - * by a load instruction to an uncached address to empty the response buffer." - * (Revision 2.0 device errata from IDT available on https://www.idt.com/ - * in .pdf format.) - */ -#ifndef R4600_V2_HIT_CACHEOP_WAR -#error Check setting of R4600_V2_HIT_CACHEOP_WAR for your platform -#endif - -/* - * Workaround for the Sibyte M3 errata the text of which can be found at - * - * http://sibyte.broadcom.com/hw/bcm1250/docs/pass2errata.txt - * - * This will enable the use of a special TLB refill handler which does a - * consistency check on the information in c0_badvaddr and c0_entryhi and - * will just return and take the exception again if the information was - * found to be inconsistent. - */ -#ifndef BCM1250_M3_WAR -#error Check setting of BCM1250_M3_WAR for your platform -#endif - -/* - * This is a DUART workaround related to glitches around register accesses - */ -#ifndef SIBYTE_1956_WAR -#error Check setting of SIBYTE_1956_WAR for your platform -#endif - -/* - * Fill buffers not flushed on CACHE instructions - * - * Hit_Invalidate_I cacheops invalidate an icache line but the refill - * for that line can get stale data from the fill buffer instead of - * accessing memory if the previous icache miss was also to that line. - * - * Workaround: generate an icache refill from a different line - * - * Affects: - * MIPS 4K RTL revision <3.0, PRID revision <4 - */ -#ifndef MIPS4K_ICACHE_REFILL_WAR -#error Check setting of MIPS4K_ICACHE_REFILL_WAR for your platform -#endif - -/* - * Missing implicit forced flush of evictions caused by CACHE - * instruction - * - * Evictions caused by a CACHE instructions are not forced on to the - * bus. The BIU gives higher priority to fetches than to the data from - * the eviction buffer and no collision detection is performed between - * fetches and pending data from the eviction buffer. - * - * Workaround: Execute a SYNC instruction after the cache instruction - * - * Affects: - * MIPS 5Kc,5Kf RTL revision <2.3, PRID revision <8 - * MIPS 20Kc RTL revision <4.0, PRID revision <? - */ -#ifndef MIPS_CACHE_SYNC_WAR -#error Check setting of MIPS_CACHE_SYNC_WAR for your platform -#endif - -/* - * From TX49/H2 manual: "If the instruction (i.e. CACHE) is issued for - * the line which this instruction itself exists, the following - * operation is not guaranteed." - * - * Workaround: do two phase flushing for Index_Invalidate_I - */ -#ifndef TX49XX_ICACHE_INDEX_INV_WAR -#error Check setting of TX49XX_ICACHE_INDEX_INV_WAR for your platform -#endif - -/* - * The RM7000 processors and the E9000 cores have a bug (though PMC-Sierra - * opposes it being called that) where invalid instructions in the same - * I-cache line worth of instructions being fetched may case spurious - * exceptions. - */ -#ifndef ICACHE_REFILLS_WORKAROUND_WAR -#error Check setting of ICACHE_REFILLS_WORKAROUND_WAR for your platform -#endif - -/* - * On the R10000 up to version 2.6 (not sure about 2.7) there is a bug that - * may cause ll / sc and lld / scd sequences to execute non-atomically. - */ -#ifndef R10000_LLSC_WAR -#error Check setting of R10000_LLSC_WAR for your platform -#endif - -/* - * 34K core erratum: "Problems Executing the TLBR Instruction" - */ -#ifndef MIPS34K_MISSED_ITLB_WAR -#error Check setting of MIPS34K_MISSED_ITLB_WAR for your platform -#endif - #endif /* _ASM_WAR_H */ diff --git a/arch/mips/jz4740/Kconfig b/arch/mips/ingenic/Kconfig index c2a6fbf8e411..3238e16febd5 100644 --- a/arch/mips/jz4740/Kconfig +++ b/arch/mips/ingenic/Kconfig @@ -1,15 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 + +config MACH_INGENIC_GENERIC + bool + select MACH_INGENIC + select MACH_JZ4740 + select MACH_JZ4770 + select MACH_JZ4780 + select MACH_X1000 + choice prompt "Machine type" - depends on MACH_INGENIC + depends on MACH_INGENIC_SOC default INGENIC_GENERIC_BOARD config INGENIC_GENERIC_BOARD bool "Generic board" - select MACH_JZ4740 - select MACH_JZ4770 - select MACH_JZ4780 - select MACH_X1000 + select MACH_INGENIC_GENERIC config JZ4740_QI_LB60 bool "Qi Hardware Ben NanoNote" diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index 014773f0bfcd..461457b28982 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -16,8 +16,7 @@ #include <linux/memblock.h> #include <linux/spinlock.h> #include <linux/gfp.h> -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <asm/mipsregs.h> #include <asm/jazz.h> #include <asm/io.h> @@ -210,76 +209,6 @@ int vdma_free(unsigned long laddr) EXPORT_SYMBOL(vdma_free); /* - * Map certain page(s) to another physical address. - * Caller must have allocated the page(s) before. - */ -int vdma_remap(unsigned long laddr, unsigned long paddr, unsigned long size) -{ - int first, pages; - - if (laddr > 0xffffff) { - if (vdma_debug) - printk - ("vdma_map: Invalid logical address: %08lx\n", - laddr); - return -EINVAL; /* invalid logical address */ - } - if (paddr > 0x1fffffff) { - if (vdma_debug) - printk - ("vdma_map: Invalid physical address: %08lx\n", - paddr); - return -EINVAL; /* invalid physical address */ - } - - pages = (((paddr & (VDMA_PAGESIZE - 1)) + size) >> 12) + 1; - first = laddr >> 12; - if (vdma_debug) - printk("vdma_remap: first=%x, pages=%x\n", first, pages); - if (first + pages > VDMA_PGTBL_ENTRIES) { - if (vdma_debug) - printk("vdma_alloc: Invalid size: %08lx\n", size); - return -EINVAL; - } - - paddr &= ~(VDMA_PAGESIZE - 1); - while (pages > 0 && first < VDMA_PGTBL_ENTRIES) { - if (pgtbl[first].owner != laddr) { - if (vdma_debug) - printk("Trying to remap other's pages.\n"); - return -EPERM; /* not owner */ - } - pgtbl[first].frame = paddr; - paddr += VDMA_PAGESIZE; - first++; - pages--; - } - - /* - * Update translation table - */ - r4030_write_reg32(JAZZ_R4030_TRSTBL_INV, 0); - - if (vdma_debug > 2) { - int i; - pages = (((paddr & (VDMA_PAGESIZE - 1)) + size) >> 12) + 1; - first = laddr >> 12; - printk("LADDR: "); - for (i = first; i < first + pages; i++) - printk("%08x ", i << 12); - printk("\nPADDR: "); - for (i = first; i < first + pages; i++) - printk("%08x ", pgtbl[i].frame); - printk("\nOWNER: "); - for (i = first; i < first + pages; i++) - printk("%08x ", pgtbl[i].owner); - printk("\n"); - } - - return 0; -} - -/* * Translate a physical address to a logical address. * This will return the logical address of the first * match. @@ -562,26 +491,34 @@ int vdma_get_enable(int channel) static void *jazz_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + struct page *page; void *ret; - ret = dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); - if (!ret) - return NULL; + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; - *dma_handle = vdma_alloc(virt_to_phys(ret), size); - if (*dma_handle == DMA_MAPPING_ERROR) { - dma_direct_free_pages(dev, size, ret, *dma_handle, attrs); + size = PAGE_ALIGN(size); + page = alloc_pages(gfp, get_order(size)); + if (!page) return NULL; - } - - return ret; + ret = page_address(page); + memset(ret, 0, size); + *dma_handle = vdma_alloc(virt_to_phys(ret), size); + if (*dma_handle == DMA_MAPPING_ERROR) + goto out_free_pages; + arch_dma_prep_coherent(page, size); + return (void *)(UNCAC_BASE + __pa(ret)); + +out_free_pages: + __free_pages(page, get_order(size)); + return NULL; } static void jazz_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { vdma_free(dma_handle); - dma_direct_free_pages(dev, size, vaddr, dma_handle, attrs); + __free_pages(virt_to_page(vaddr), get_order(size)); } static dma_addr_t jazz_dma_map_page(struct device *dev, struct page *page, @@ -678,9 +615,9 @@ const struct dma_map_ops jazz_dma_ops = { .sync_single_for_device = jazz_dma_sync_single_for_device, .sync_sg_for_cpu = jazz_dma_sync_sg_for_cpu, .sync_sg_for_device = jazz_dma_sync_sg_for_device, - .dma_supported = dma_direct_supported, - .cache_sync = arch_dma_cache_sync, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(jazz_dma_ops); diff --git a/arch/mips/jz4740/Makefile b/arch/mips/jz4740/Makefile deleted file mode 100644 index f96c0f5eca44..000000000000 --- a/arch/mips/jz4740/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the Ingenic JZ4740. -# - -# Object file lists. -obj-y += setup.o - -CFLAGS_setup.o = -I$(src)/../../../scripts/dtc/libfdt diff --git a/arch/mips/jz4740/Platform b/arch/mips/jz4740/Platform deleted file mode 100644 index bd35d0621b13..000000000000 --- a/arch/mips/jz4740/Platform +++ /dev/null @@ -1,3 +0,0 @@ -cflags-$(CONFIG_MACH_INGENIC) += -I$(srctree)/arch/mips/include/asm/mach-jz4740 -load-$(CONFIG_MACH_INGENIC) += 0xffffffff80010000 -zload-$(CONFIG_MACH_INGENIC) += 0xffffffff81000000 diff --git a/arch/mips/jz4740/setup.c b/arch/mips/jz4740/setup.c deleted file mode 100644 index 51d906325ce6..000000000000 --- a/arch/mips/jz4740/setup.c +++ /dev/null @@ -1,145 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de> - * Copyright (C) 2011, Maarten ter Huurne <maarten@treewalker.org> - * JZ4740 setup code - */ - -#include <linux/clocksource.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/irqchip.h> -#include <linux/kernel.h> -#include <linux/libfdt.h> -#include <linux/of_clk.h> -#include <linux/of_fdt.h> -#include <linux/pm.h> -#include <linux/sizes.h> -#include <linux/suspend.h> - -#include <asm/bootinfo.h> -#include <asm/fw/fw.h> -#include <asm/prom.h> -#include <asm/reboot.h> -#include <asm/time.h> - -static unsigned long __init get_board_mach_type(const void *fdt) -{ - if (!fdt_node_check_compatible(fdt, 0, "ingenic,x2000")) - return MACH_INGENIC_X2000; - if (!fdt_node_check_compatible(fdt, 0, "ingenic,x1830")) - return MACH_INGENIC_X1830; - if (!fdt_node_check_compatible(fdt, 0, "ingenic,x1000")) - return MACH_INGENIC_X1000; - if (!fdt_node_check_compatible(fdt, 0, "ingenic,jz4780")) - return MACH_INGENIC_JZ4780; - if (!fdt_node_check_compatible(fdt, 0, "ingenic,jz4770")) - return MACH_INGENIC_JZ4770; - if (!fdt_node_check_compatible(fdt, 0, "ingenic,jz4725b")) - return MACH_INGENIC_JZ4725B; - - return MACH_INGENIC_JZ4740; -} - -void __init plat_mem_setup(void) -{ - void *dtb = (void *)fw_passed_dtb; - - __dt_setup_arch(dtb); - - /* - * Old devicetree files for the qi,lb60 board did not have a /memory - * node. Hardcode the memory info here. - */ - if (!fdt_node_check_compatible(dtb, 0, "qi,lb60") && - fdt_path_offset(dtb, "/memory") < 0) - early_init_dt_add_memory_arch(0, SZ_32M); - - mips_machtype = get_board_mach_type(dtb); -} - -void __init device_tree_init(void) -{ - if (!initial_boot_params) - return; - - unflatten_and_copy_device_tree(); -} - -const char *get_system_type(void) -{ - switch (mips_machtype) { - case MACH_INGENIC_X2000: - return "X2000"; - case MACH_INGENIC_X1830: - return "X1830"; - case MACH_INGENIC_X1000: - return "X1000"; - case MACH_INGENIC_JZ4780: - return "JZ4780"; - case MACH_INGENIC_JZ4770: - return "JZ4770"; - case MACH_INGENIC_JZ4725B: - return "JZ4725B"; - default: - return "JZ4740"; - } -} - -void __init arch_init_irq(void) -{ - irqchip_init(); -} - -void __init plat_time_init(void) -{ - of_clk_init(NULL); - timer_probe(); -} - -void __init prom_init(void) -{ - fw_init_cmdline(); -} - -void __init prom_free_prom_memory(void) -{ -} - -static void jz4740_wait_instr(void) -{ - __asm__(".set push;\n" - ".set mips3;\n" - "wait;\n" - ".set pop;\n" - ); -} - -static void jz4740_halt(void) -{ - for (;;) - jz4740_wait_instr(); -} - -static int __maybe_unused jz4740_pm_enter(suspend_state_t state) -{ - jz4740_wait_instr(); - - return 0; -} - -static const struct platform_suspend_ops jz4740_pm_ops __maybe_unused = { - .valid = suspend_valid_only_mem, - .enter = jz4740_pm_enter, -}; - -static int __init jz4740_pm_init(void) -{ - if (IS_ENABLED(CONFIG_PM_SLEEP)) - suspend_set_ops(&jz4740_pm_ops); - _machine_halt = jz4740_halt; - - return 0; - -} -late_initcall(jz4740_pm_init); diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 13a26d254829..2a05b923f579 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -5,11 +5,17 @@ extra-y := head.o vmlinux.lds -obj-y += cmpxchg.o cpu-probe.o branch.o elf.o entry.o genex.o idle.o irq.o \ +obj-y += branch.o cmpxchg.o elf.o entry.o genex.o idle.o irq.o \ process.o prom.o ptrace.o reset.o setup.o signal.o \ syscall.o time.o topology.o traps.o unaligned.o watch.o \ vdso.o cacheinfo.o +ifdef CONFIG_CPU_R3K_TLB +obj-y += cpu-r3k-probe.o +else +obj-y += cpu-probe.o +endif + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg @@ -42,6 +48,7 @@ sw-$(CONFIG_CPU_TX39XX) := r2300_switch.o sw-$(CONFIG_CPU_CAVIUM_OCTEON) := octeon_switch.o obj-y += $(sw-y) +obj-$(CONFIG_MIPS_FP_SUPPORT) += fpu-probe.o obj-$(CONFIG_CPU_R2300_FPU) += r2300_fpu.o obj-$(CONFIG_CPU_R4K_FPU) += r4k_fpu.o diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index fb3e203698ea..0216ff24c392 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -20,6 +20,8 @@ #include <asm/ptrace.h> #include <linux/uaccess.h> +#include "probes-common.h" + /* * Calculate and return exception PC in case of branch delay slot * for microMIPS and MIPS16e. It does not clear the ISA mode bit. diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index e2955f1f6316..e6853697a056 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -28,336 +28,14 @@ #include <asm/spram.h> #include <linux/uaccess.h> +#include "fpu-probe.h" + #include <asm/mach-loongson64/cpucfg-emul.h> /* Hardware capabilities */ unsigned int elf_hwcap __read_mostly; EXPORT_SYMBOL_GPL(elf_hwcap); -#ifdef CONFIG_MIPS_FP_SUPPORT - -/* - * Get the FPU Implementation/Revision. - */ -static inline unsigned long cpu_get_fpu_id(void) -{ - unsigned long tmp, fpu_id; - - tmp = read_c0_status(); - __enable_fpu(FPU_AS_IS); - fpu_id = read_32bit_cp1_register(CP1_REVISION); - write_c0_status(tmp); - return fpu_id; -} - -/* - * Check if the CPU has an external FPU. - */ -static inline int __cpu_has_fpu(void) -{ - return (cpu_get_fpu_id() & FPIR_IMP_MASK) != FPIR_IMP_NONE; -} - -/* - * Determine the FCSR mask for FPU hardware. - */ -static inline void cpu_set_fpu_fcsr_mask(struct cpuinfo_mips *c) -{ - unsigned long sr, mask, fcsr, fcsr0, fcsr1; - - fcsr = c->fpu_csr31; - mask = FPU_CSR_ALL_X | FPU_CSR_ALL_E | FPU_CSR_ALL_S | FPU_CSR_RM; - - sr = read_c0_status(); - __enable_fpu(FPU_AS_IS); - - fcsr0 = fcsr & mask; - write_32bit_cp1_register(CP1_STATUS, fcsr0); - fcsr0 = read_32bit_cp1_register(CP1_STATUS); - - fcsr1 = fcsr | ~mask; - write_32bit_cp1_register(CP1_STATUS, fcsr1); - fcsr1 = read_32bit_cp1_register(CP1_STATUS); - - write_32bit_cp1_register(CP1_STATUS, fcsr); - - write_c0_status(sr); - - c->fpu_msk31 = ~(fcsr0 ^ fcsr1) & ~mask; -} - -/* - * Determine the IEEE 754 NaN encodings and ABS.fmt/NEG.fmt execution modes - * supported by FPU hardware. - */ -static void cpu_set_fpu_2008(struct cpuinfo_mips *c) -{ - if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | - MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | - MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | - MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { - unsigned long sr, fir, fcsr, fcsr0, fcsr1; - - sr = read_c0_status(); - __enable_fpu(FPU_AS_IS); - - fir = read_32bit_cp1_register(CP1_REVISION); - if (fir & MIPS_FPIR_HAS2008) { - fcsr = read_32bit_cp1_register(CP1_STATUS); - - /* - * MAC2008 toolchain never landed in real world, so we're only - * testing wether it can be disabled and don't try to enabled - * it. - */ - fcsr0 = fcsr & ~(FPU_CSR_ABS2008 | FPU_CSR_NAN2008 | FPU_CSR_MAC2008); - write_32bit_cp1_register(CP1_STATUS, fcsr0); - fcsr0 = read_32bit_cp1_register(CP1_STATUS); - - fcsr1 = fcsr | FPU_CSR_ABS2008 | FPU_CSR_NAN2008; - write_32bit_cp1_register(CP1_STATUS, fcsr1); - fcsr1 = read_32bit_cp1_register(CP1_STATUS); - - write_32bit_cp1_register(CP1_STATUS, fcsr); - - if (c->isa_level & (MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2)) { - /* - * The bit for MAC2008 might be reused by R6 in future, - * so we only test for R2-R5. - */ - if (fcsr0 & FPU_CSR_MAC2008) - c->options |= MIPS_CPU_MAC_2008_ONLY; - } - - if (!(fcsr0 & FPU_CSR_NAN2008)) - c->options |= MIPS_CPU_NAN_LEGACY; - if (fcsr1 & FPU_CSR_NAN2008) - c->options |= MIPS_CPU_NAN_2008; - - if ((fcsr0 ^ fcsr1) & FPU_CSR_ABS2008) - c->fpu_msk31 &= ~FPU_CSR_ABS2008; - else - c->fpu_csr31 |= fcsr & FPU_CSR_ABS2008; - - if ((fcsr0 ^ fcsr1) & FPU_CSR_NAN2008) - c->fpu_msk31 &= ~FPU_CSR_NAN2008; - else - c->fpu_csr31 |= fcsr & FPU_CSR_NAN2008; - } else { - c->options |= MIPS_CPU_NAN_LEGACY; - } - - write_c0_status(sr); - } else { - c->options |= MIPS_CPU_NAN_LEGACY; - } -} - -/* - * IEEE 754 conformance mode to use. Affects the NaN encoding and the - * ABS.fmt/NEG.fmt execution mode. - */ -static enum { STRICT, LEGACY, STD2008, RELAXED } ieee754 = STRICT; - -/* - * Set the IEEE 754 NaN encodings and the ABS.fmt/NEG.fmt execution modes - * to support by the FPU emulator according to the IEEE 754 conformance - * mode selected. Note that "relaxed" straps the emulator so that it - * allows 2008-NaN binaries even for legacy processors. - */ -static void cpu_set_nofpu_2008(struct cpuinfo_mips *c) -{ - c->options &= ~(MIPS_CPU_NAN_2008 | MIPS_CPU_NAN_LEGACY); - c->fpu_csr31 &= ~(FPU_CSR_ABS2008 | FPU_CSR_NAN2008); - c->fpu_msk31 &= ~(FPU_CSR_ABS2008 | FPU_CSR_NAN2008); - - switch (ieee754) { - case STRICT: - if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | - MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | - MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | - MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { - c->options |= MIPS_CPU_NAN_2008 | MIPS_CPU_NAN_LEGACY; - } else { - c->options |= MIPS_CPU_NAN_LEGACY; - c->fpu_msk31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; - } - break; - case LEGACY: - c->options |= MIPS_CPU_NAN_LEGACY; - c->fpu_msk31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; - break; - case STD2008: - c->options |= MIPS_CPU_NAN_2008; - c->fpu_csr31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; - c->fpu_msk31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; - break; - case RELAXED: - c->options |= MIPS_CPU_NAN_2008 | MIPS_CPU_NAN_LEGACY; - break; - } -} - -/* - * Override the IEEE 754 NaN encoding and ABS.fmt/NEG.fmt execution mode - * according to the "ieee754=" parameter. - */ -static void cpu_set_nan_2008(struct cpuinfo_mips *c) -{ - switch (ieee754) { - case STRICT: - mips_use_nan_legacy = !!cpu_has_nan_legacy; - mips_use_nan_2008 = !!cpu_has_nan_2008; - break; - case LEGACY: - mips_use_nan_legacy = !!cpu_has_nan_legacy; - mips_use_nan_2008 = !cpu_has_nan_legacy; - break; - case STD2008: - mips_use_nan_legacy = !cpu_has_nan_2008; - mips_use_nan_2008 = !!cpu_has_nan_2008; - break; - case RELAXED: - mips_use_nan_legacy = true; - mips_use_nan_2008 = true; - break; - } -} - -/* - * IEEE 754 NaN encoding and ABS.fmt/NEG.fmt execution mode override - * settings: - * - * strict: accept binaries that request a NaN encoding supported by the FPU - * legacy: only accept legacy-NaN binaries - * 2008: only accept 2008-NaN binaries - * relaxed: accept any binaries regardless of whether supported by the FPU - */ -static int __init ieee754_setup(char *s) -{ - if (!s) - return -1; - else if (!strcmp(s, "strict")) - ieee754 = STRICT; - else if (!strcmp(s, "legacy")) - ieee754 = LEGACY; - else if (!strcmp(s, "2008")) - ieee754 = STD2008; - else if (!strcmp(s, "relaxed")) - ieee754 = RELAXED; - else - return -1; - - if (!(boot_cpu_data.options & MIPS_CPU_FPU)) - cpu_set_nofpu_2008(&boot_cpu_data); - cpu_set_nan_2008(&boot_cpu_data); - - return 0; -} - -early_param("ieee754", ieee754_setup); - -/* - * Set the FIR feature flags for the FPU emulator. - */ -static void cpu_set_nofpu_id(struct cpuinfo_mips *c) -{ - u32 value; - - value = 0; - if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | - MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | - MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | - MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) - value |= MIPS_FPIR_D | MIPS_FPIR_S; - if (c->isa_level & (MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | - MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | - MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) - value |= MIPS_FPIR_F64 | MIPS_FPIR_L | MIPS_FPIR_W; - if (c->options & MIPS_CPU_NAN_2008) - value |= MIPS_FPIR_HAS2008; - c->fpu_id = value; -} - -/* Determined FPU emulator mask to use for the boot CPU with "nofpu". */ -static unsigned int mips_nofpu_msk31; - -/* - * Set options for FPU hardware. - */ -static void cpu_set_fpu_opts(struct cpuinfo_mips *c) -{ - c->fpu_id = cpu_get_fpu_id(); - mips_nofpu_msk31 = c->fpu_msk31; - - if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | - MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | - MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | - MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { - if (c->fpu_id & MIPS_FPIR_3D) - c->ases |= MIPS_ASE_MIPS3D; - if (c->fpu_id & MIPS_FPIR_UFRP) - c->options |= MIPS_CPU_UFR; - if (c->fpu_id & MIPS_FPIR_FREP) - c->options |= MIPS_CPU_FRE; - } - - cpu_set_fpu_fcsr_mask(c); - cpu_set_fpu_2008(c); - cpu_set_nan_2008(c); -} - -/* - * Set options for the FPU emulator. - */ -static void cpu_set_nofpu_opts(struct cpuinfo_mips *c) -{ - c->options &= ~MIPS_CPU_FPU; - c->fpu_msk31 = mips_nofpu_msk31; - - cpu_set_nofpu_2008(c); - cpu_set_nan_2008(c); - cpu_set_nofpu_id(c); -} - -static int mips_fpu_disabled; - -static int __init fpu_disable(char *s) -{ - cpu_set_nofpu_opts(&boot_cpu_data); - mips_fpu_disabled = 1; - - return 1; -} - -__setup("nofpu", fpu_disable); - -#else /* !CONFIG_MIPS_FP_SUPPORT */ - -#define mips_fpu_disabled 1 - -static inline unsigned long cpu_get_fpu_id(void) -{ - return FPIR_IMP_NONE; -} - -static inline int __cpu_has_fpu(void) -{ - return 0; -} - -static void cpu_set_fpu_opts(struct cpuinfo_mips *c) -{ - /* no-op */ -} - -static void cpu_set_nofpu_opts(struct cpuinfo_mips *c) -{ - /* no-op */ -} - -#endif /* CONFIG_MIPS_FP_SUPPORT */ - static inline unsigned long cpu_get_msa_id(void) { unsigned long status, msa_id; @@ -1600,8 +1278,9 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu) c->options = MIPS_CPU_TLB | MIPS_CPU_4K_CACHE | MIPS_CPU_4KEX | MIPS_CPU_FPU | MIPS_CPU_32FPR | MIPS_CPU_COUNTER | MIPS_CPU_WATCH | - MIPS_CPU_LLSC | MIPS_CPU_BP_GHIST; + MIPS_CPU_LLSC; c->tlbsize = 64; + write_c0_r10k_diag(read_c0_r10k_diag() | R10K_DIAG_E_GHIST); break; case PRID_IMP_R14000: if (((c->processor_id >> 4) & 0x0f) > 2) { @@ -1615,8 +1294,9 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu) c->options = MIPS_CPU_TLB | MIPS_CPU_4K_CACHE | MIPS_CPU_4KEX | MIPS_CPU_FPU | MIPS_CPU_32FPR | MIPS_CPU_COUNTER | MIPS_CPU_WATCH | - MIPS_CPU_LLSC | MIPS_CPU_BP_GHIST; + MIPS_CPU_LLSC; c->tlbsize = 64; + write_c0_r10k_diag(read_c0_r10k_diag() | R10K_DIAG_E_GHIST); break; case PRID_IMP_LOONGSON_64C: /* Loongson-2/3 */ switch (c->processor_id & PRID_REV_MASK) { @@ -2123,7 +1803,10 @@ static inline void cpu_probe_ingenic(struct cpuinfo_mips *c, unsigned int cpu) /* XBurst does not implement the CP0 counter. */ c->options &= ~MIPS_CPU_COUNTER; - BUG_ON(!__builtin_constant_p(cpu_has_counter) || cpu_has_counter); + BUG_ON(__builtin_constant_p(cpu_has_counter) && cpu_has_counter); + + /* XBurst has virtually tagged icache */ + c->icache.flags |= MIPS_CACHE_VTAG; switch (c->processor_id & PRID_IMP_MASK) { @@ -2169,8 +1852,9 @@ static inline void cpu_probe_ingenic(struct cpuinfo_mips *c, unsigned int cpu) /* XBurst®1 with MXU2.0 SIMD ISA */ case PRID_IMP_XBURST_REV2: + /* Ingenic uses the WA bit to achieve write-combine memory writes */ + c->writecombine = _CACHE_CACHABLE_WA; c->cputype = CPU_XBURST; - c->writecombine = _CACHE_UNCACHED_ACCELERATED; __cpu_name[cpu] = "Ingenic XBurst"; break; @@ -2372,10 +2056,6 @@ void cpu_probe(void) else cpu_set_nofpu_opts(c); - if (cpu_has_bp_ghist) - write_c0_r10k_diag(read_c0_r10k_diag() | - R10K_DIAG_E_GHIST); - if (cpu_has_mips_r2_r6) { c->srsets = ((read_c0_srsctl() >> 26) & 0x0f) + 1; /* R2 has Performance Counter Interrupt indicator */ diff --git a/arch/mips/kernel/cpu-r3k-probe.c b/arch/mips/kernel/cpu-r3k-probe.c new file mode 100644 index 000000000000..abdbbe8c5a43 --- /dev/null +++ b/arch/mips/kernel/cpu-r3k-probe.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Processor capabilities determination functions. + * + * Copyright (C) xxxx the Anonymous + * Copyright (C) 1994 - 2006 Ralf Baechle + * Copyright (C) 2003, 2004 Maciej W. Rozycki + * Copyright (C) 2001, 2004, 2011, 2012 MIPS Technologies, Inc. + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/ptrace.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/export.h> + +#include <asm/bugs.h> +#include <asm/cpu.h> +#include <asm/cpu-features.h> +#include <asm/cpu-type.h> +#include <asm/fpu.h> +#include <asm/mipsregs.h> +#include <asm/elf.h> + +#include "fpu-probe.h" + +/* Hardware capabilities */ +unsigned int elf_hwcap __read_mostly; +EXPORT_SYMBOL_GPL(elf_hwcap); + +void __init check_bugs32(void) +{ + +} + +/* + * Probe whether cpu has config register by trying to play with + * alternate cache bit and see whether it matters. + * It's used by cpu_probe to distinguish between R3000A and R3081. + */ +static inline int cpu_has_confreg(void) +{ +#ifdef CONFIG_CPU_R3000 + extern unsigned long r3k_cache_size(unsigned long); + unsigned long size1, size2; + unsigned long cfg = read_c0_conf(); + + size1 = r3k_cache_size(ST0_ISC); + write_c0_conf(cfg ^ R30XX_CONF_AC); + size2 = r3k_cache_size(ST0_ISC); + write_c0_conf(cfg); + return size1 != size2; +#else + return 0; +#endif +} + +static inline void set_elf_platform(int cpu, const char *plat) +{ + if (cpu == 0) + __elf_platform = plat; +} + +const char *__cpu_name[NR_CPUS]; +const char *__elf_platform; +const char *__elf_base_platform; + +void cpu_probe(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + unsigned int cpu = smp_processor_id(); + + /* + * Set a default elf platform, cpu probe may later + * overwrite it with a more precise value + */ + set_elf_platform(cpu, "mips"); + + c->processor_id = PRID_IMP_UNKNOWN; + c->fpu_id = FPIR_IMP_NONE; + c->cputype = CPU_UNKNOWN; + c->writecombine = _CACHE_UNCACHED; + + c->fpu_csr31 = FPU_CSR_RN; + c->fpu_msk31 = FPU_CSR_RSVD | FPU_CSR_ABS2008 | FPU_CSR_NAN2008 | + FPU_CSR_CONDX | FPU_CSR_FS; + + c->srsets = 1; + + c->processor_id = read_c0_prid(); + switch (c->processor_id & (PRID_COMP_MASK | PRID_IMP_MASK)) { + case PRID_COMP_LEGACY | PRID_IMP_R2000: + c->cputype = CPU_R2000; + __cpu_name[cpu] = "R2000"; + c->options = MIPS_CPU_TLB | MIPS_CPU_3K_CACHE | + MIPS_CPU_NOFPUEX; + if (__cpu_has_fpu()) + c->options |= MIPS_CPU_FPU; + c->tlbsize = 64; + break; + case PRID_COMP_LEGACY | PRID_IMP_R3000: + if ((c->processor_id & PRID_REV_MASK) == PRID_REV_R3000A) { + if (cpu_has_confreg()) { + c->cputype = CPU_R3081E; + __cpu_name[cpu] = "R3081"; + } else { + c->cputype = CPU_R3000A; + __cpu_name[cpu] = "R3000A"; + } + } else { + c->cputype = CPU_R3000; + __cpu_name[cpu] = "R3000"; + } + c->options = MIPS_CPU_TLB | MIPS_CPU_3K_CACHE | + MIPS_CPU_NOFPUEX; + if (__cpu_has_fpu()) + c->options |= MIPS_CPU_FPU; + c->tlbsize = 64; + break; + case PRID_COMP_LEGACY | PRID_IMP_TX39: + c->options = MIPS_CPU_TLB | MIPS_CPU_TX39_CACHE; + + if ((c->processor_id & 0xf0) == (PRID_REV_TX3927 & 0xf0)) { + c->cputype = CPU_TX3927; + __cpu_name[cpu] = "TX3927"; + c->tlbsize = 64; + } else { + switch (c->processor_id & PRID_REV_MASK) { + case PRID_REV_TX3912: + c->cputype = CPU_TX3912; + __cpu_name[cpu] = "TX3912"; + c->tlbsize = 32; + break; + case PRID_REV_TX3922: + c->cputype = CPU_TX3922; + __cpu_name[cpu] = "TX3922"; + c->tlbsize = 64; + break; + } + } + break; + } + + BUG_ON(!__cpu_name[cpu]); + BUG_ON(c->cputype == CPU_UNKNOWN); + + /* + * Platform code can force the cpu type to optimize code + * generation. In that case be sure the cpu type is correctly + * manually setup otherwise it could trigger some nasty bugs. + */ + BUG_ON(current_cpu_type() != c->cputype); + + if (mips_fpu_disabled) + c->options &= ~MIPS_CPU_FPU; + + if (c->options & MIPS_CPU_FPU) + cpu_set_fpu_opts(c); + else + cpu_set_nofpu_opts(c); +} + +void cpu_report(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + + pr_info("CPU%d revision is: %08x (%s)\n", + smp_processor_id(), c->processor_id, cpu_name_string()); + if (c->options & MIPS_CPU_FPU) + pr_info("FPU revision is: %08x\n", c->fpu_id); +} diff --git a/arch/mips/kernel/fpu-probe.c b/arch/mips/kernel/fpu-probe.c new file mode 100644 index 000000000000..e689d6a83234 --- /dev/null +++ b/arch/mips/kernel/fpu-probe.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Processor capabilities determination functions. + * + * Copyright (C) xxxx the Anonymous + * Copyright (C) 1994 - 2006 Ralf Baechle + * Copyright (C) 2003, 2004 Maciej W. Rozycki + * Copyright (C) 2001, 2004, 2011, 2012 MIPS Technologies, Inc. + */ + +#include <linux/init.h> +#include <linux/kernel.h> + +#include <asm/bugs.h> +#include <asm/cpu.h> +#include <asm/cpu-features.h> +#include <asm/cpu-type.h> +#include <asm/elf.h> +#include <asm/fpu.h> +#include <asm/mipsregs.h> + +#include "fpu-probe.h" + +/* + * Get the FPU Implementation/Revision. + */ +static inline unsigned long cpu_get_fpu_id(void) +{ + unsigned long tmp, fpu_id; + + tmp = read_c0_status(); + __enable_fpu(FPU_AS_IS); + fpu_id = read_32bit_cp1_register(CP1_REVISION); + write_c0_status(tmp); + return fpu_id; +} + +/* + * Check if the CPU has an external FPU. + */ +int __cpu_has_fpu(void) +{ + return (cpu_get_fpu_id() & FPIR_IMP_MASK) != FPIR_IMP_NONE; +} + +/* + * Determine the FCSR mask for FPU hardware. + */ +static inline void cpu_set_fpu_fcsr_mask(struct cpuinfo_mips *c) +{ + unsigned long sr, mask, fcsr, fcsr0, fcsr1; + + fcsr = c->fpu_csr31; + mask = FPU_CSR_ALL_X | FPU_CSR_ALL_E | FPU_CSR_ALL_S | FPU_CSR_RM; + + sr = read_c0_status(); + __enable_fpu(FPU_AS_IS); + + fcsr0 = fcsr & mask; + write_32bit_cp1_register(CP1_STATUS, fcsr0); + fcsr0 = read_32bit_cp1_register(CP1_STATUS); + + fcsr1 = fcsr | ~mask; + write_32bit_cp1_register(CP1_STATUS, fcsr1); + fcsr1 = read_32bit_cp1_register(CP1_STATUS); + + write_32bit_cp1_register(CP1_STATUS, fcsr); + + write_c0_status(sr); + + c->fpu_msk31 = ~(fcsr0 ^ fcsr1) & ~mask; +} + +/* + * Determine the IEEE 754 NaN encodings and ABS.fmt/NEG.fmt execution modes + * supported by FPU hardware. + */ +static void cpu_set_fpu_2008(struct cpuinfo_mips *c) +{ + if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | + MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | + MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | + MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { + unsigned long sr, fir, fcsr, fcsr0, fcsr1; + + sr = read_c0_status(); + __enable_fpu(FPU_AS_IS); + + fir = read_32bit_cp1_register(CP1_REVISION); + if (fir & MIPS_FPIR_HAS2008) { + fcsr = read_32bit_cp1_register(CP1_STATUS); + + /* + * MAC2008 toolchain never landed in real world, so + * we're only testing whether it can be disabled and + * don't try to enabled it. + */ + fcsr0 = fcsr & ~(FPU_CSR_ABS2008 | FPU_CSR_NAN2008 | + FPU_CSR_MAC2008); + write_32bit_cp1_register(CP1_STATUS, fcsr0); + fcsr0 = read_32bit_cp1_register(CP1_STATUS); + + fcsr1 = fcsr | FPU_CSR_ABS2008 | FPU_CSR_NAN2008; + write_32bit_cp1_register(CP1_STATUS, fcsr1); + fcsr1 = read_32bit_cp1_register(CP1_STATUS); + + write_32bit_cp1_register(CP1_STATUS, fcsr); + + if (c->isa_level & (MIPS_CPU_ISA_M32R2 | + MIPS_CPU_ISA_M64R2)) { + /* + * The bit for MAC2008 might be reused by R6 + * in future, so we only test for R2-R5. + */ + if (fcsr0 & FPU_CSR_MAC2008) + c->options |= MIPS_CPU_MAC_2008_ONLY; + } + + if (!(fcsr0 & FPU_CSR_NAN2008)) + c->options |= MIPS_CPU_NAN_LEGACY; + if (fcsr1 & FPU_CSR_NAN2008) + c->options |= MIPS_CPU_NAN_2008; + + if ((fcsr0 ^ fcsr1) & FPU_CSR_ABS2008) + c->fpu_msk31 &= ~FPU_CSR_ABS2008; + else + c->fpu_csr31 |= fcsr & FPU_CSR_ABS2008; + + if ((fcsr0 ^ fcsr1) & FPU_CSR_NAN2008) + c->fpu_msk31 &= ~FPU_CSR_NAN2008; + else + c->fpu_csr31 |= fcsr & FPU_CSR_NAN2008; + } else { + c->options |= MIPS_CPU_NAN_LEGACY; + } + + write_c0_status(sr); + } else { + c->options |= MIPS_CPU_NAN_LEGACY; + } +} + +/* + * IEEE 754 conformance mode to use. Affects the NaN encoding and the + * ABS.fmt/NEG.fmt execution mode. + */ +static enum { STRICT, LEGACY, STD2008, RELAXED } ieee754 = STRICT; + +/* + * Set the IEEE 754 NaN encodings and the ABS.fmt/NEG.fmt execution modes + * to support by the FPU emulator according to the IEEE 754 conformance + * mode selected. Note that "relaxed" straps the emulator so that it + * allows 2008-NaN binaries even for legacy processors. + */ +static void cpu_set_nofpu_2008(struct cpuinfo_mips *c) +{ + c->options &= ~(MIPS_CPU_NAN_2008 | MIPS_CPU_NAN_LEGACY); + c->fpu_csr31 &= ~(FPU_CSR_ABS2008 | FPU_CSR_NAN2008); + c->fpu_msk31 &= ~(FPU_CSR_ABS2008 | FPU_CSR_NAN2008); + + switch (ieee754) { + case STRICT: + if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | + MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | + MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | + MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { + c->options |= MIPS_CPU_NAN_2008 | MIPS_CPU_NAN_LEGACY; + } else { + c->options |= MIPS_CPU_NAN_LEGACY; + c->fpu_msk31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; + } + break; + case LEGACY: + c->options |= MIPS_CPU_NAN_LEGACY; + c->fpu_msk31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; + break; + case STD2008: + c->options |= MIPS_CPU_NAN_2008; + c->fpu_csr31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; + c->fpu_msk31 |= FPU_CSR_ABS2008 | FPU_CSR_NAN2008; + break; + case RELAXED: + c->options |= MIPS_CPU_NAN_2008 | MIPS_CPU_NAN_LEGACY; + break; + } +} + +/* + * Override the IEEE 754 NaN encoding and ABS.fmt/NEG.fmt execution mode + * according to the "ieee754=" parameter. + */ +static void cpu_set_nan_2008(struct cpuinfo_mips *c) +{ + switch (ieee754) { + case STRICT: + mips_use_nan_legacy = !!cpu_has_nan_legacy; + mips_use_nan_2008 = !!cpu_has_nan_2008; + break; + case LEGACY: + mips_use_nan_legacy = !!cpu_has_nan_legacy; + mips_use_nan_2008 = !cpu_has_nan_legacy; + break; + case STD2008: + mips_use_nan_legacy = !cpu_has_nan_2008; + mips_use_nan_2008 = !!cpu_has_nan_2008; + break; + case RELAXED: + mips_use_nan_legacy = true; + mips_use_nan_2008 = true; + break; + } +} + +/* + * IEEE 754 NaN encoding and ABS.fmt/NEG.fmt execution mode override + * settings: + * + * strict: accept binaries that request a NaN encoding supported by the FPU + * legacy: only accept legacy-NaN binaries + * 2008: only accept 2008-NaN binaries + * relaxed: accept any binaries regardless of whether supported by the FPU + */ +static int __init ieee754_setup(char *s) +{ + if (!s) + return -1; + else if (!strcmp(s, "strict")) + ieee754 = STRICT; + else if (!strcmp(s, "legacy")) + ieee754 = LEGACY; + else if (!strcmp(s, "2008")) + ieee754 = STD2008; + else if (!strcmp(s, "relaxed")) + ieee754 = RELAXED; + else + return -1; + + if (!(boot_cpu_data.options & MIPS_CPU_FPU)) + cpu_set_nofpu_2008(&boot_cpu_data); + cpu_set_nan_2008(&boot_cpu_data); + + return 0; +} + +early_param("ieee754", ieee754_setup); + +/* + * Set the FIR feature flags for the FPU emulator. + */ +static void cpu_set_nofpu_id(struct cpuinfo_mips *c) +{ + u32 value; + + value = 0; + if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | + MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | + MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | + MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) + value |= MIPS_FPIR_D | MIPS_FPIR_S; + if (c->isa_level & (MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | + MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | + MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) + value |= MIPS_FPIR_F64 | MIPS_FPIR_L | MIPS_FPIR_W; + if (c->options & MIPS_CPU_NAN_2008) + value |= MIPS_FPIR_HAS2008; + c->fpu_id = value; +} + +/* Determined FPU emulator mask to use for the boot CPU with "nofpu". */ +static unsigned int mips_nofpu_msk31; + +/* + * Set options for FPU hardware. + */ +void cpu_set_fpu_opts(struct cpuinfo_mips *c) +{ + c->fpu_id = cpu_get_fpu_id(); + mips_nofpu_msk31 = c->fpu_msk31; + + if (c->isa_level & (MIPS_CPU_ISA_M32R1 | MIPS_CPU_ISA_M64R1 | + MIPS_CPU_ISA_M32R2 | MIPS_CPU_ISA_M64R2 | + MIPS_CPU_ISA_M32R5 | MIPS_CPU_ISA_M64R5 | + MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { + if (c->fpu_id & MIPS_FPIR_3D) + c->ases |= MIPS_ASE_MIPS3D; + if (c->fpu_id & MIPS_FPIR_UFRP) + c->options |= MIPS_CPU_UFR; + if (c->fpu_id & MIPS_FPIR_FREP) + c->options |= MIPS_CPU_FRE; + } + + cpu_set_fpu_fcsr_mask(c); + cpu_set_fpu_2008(c); + cpu_set_nan_2008(c); +} + +/* + * Set options for the FPU emulator. + */ +void cpu_set_nofpu_opts(struct cpuinfo_mips *c) +{ + c->options &= ~MIPS_CPU_FPU; + c->fpu_msk31 = mips_nofpu_msk31; + + cpu_set_nofpu_2008(c); + cpu_set_nan_2008(c); + cpu_set_nofpu_id(c); +} + +int mips_fpu_disabled; + +static int __init fpu_disable(char *s) +{ + cpu_set_nofpu_opts(&boot_cpu_data); + mips_fpu_disabled = 1; + + return 1; +} + +__setup("nofpu", fpu_disable); + diff --git a/arch/mips/kernel/fpu-probe.h b/arch/mips/kernel/fpu-probe.h new file mode 100644 index 000000000000..951ce50890d0 --- /dev/null +++ b/arch/mips/kernel/fpu-probe.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include <linux/kernel.h> + +#include <asm/cpu.h> +#include <asm/cpu-info.h> + +#ifdef CONFIG_MIPS_FP_SUPPORT + +extern int mips_fpu_disabled; + +int __cpu_has_fpu(void); +void cpu_set_fpu_opts(struct cpuinfo_mips *c); +void cpu_set_nofpu_opts(struct cpuinfo_mips *c); + +#else /* !CONFIG_MIPS_FP_SUPPORT */ + +#define mips_fpu_disabled 1 + +static inline unsigned long cpu_get_fpu_id(void) +{ + return FPIR_IMP_NONE; +} + +static inline int __cpu_has_fpu(void) +{ + return 0; +} + +static inline void cpu_set_fpu_opts(struct cpuinfo_mips *c) +{ + /* no-op */ +} + +static inline void cpu_set_nofpu_opts(struct cpuinfo_mips *c) +{ + /* no-op */ +} + +#endif /* CONFIG_MIPS_FP_SUPPORT */ diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 2625232bfe52..f57e68f40a34 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -37,10 +37,6 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -#endif - -#ifdef CONFIG_DYNAMIC_FTRACE - #define JAL 0x0c000000 /* jump & link: ip --> ra, jump to target */ #define ADDR_MASK 0x03ffffff /* op_code|addr : 31...26|25 ....0 */ #define JUMP_RANGE_MASK ((1UL << 28) - 1) diff --git a/arch/mips/kernel/head.S b/arch/mips/kernel/head.S index 7dd234e788e6..61b73580b877 100644 --- a/arch/mips/kernel/head.S +++ b/arch/mips/kernel/head.S @@ -35,7 +35,7 @@ .macro setup_c0_status set clr .set push mfc0 t0, CP0_STATUS - or t0, ST0_CU0|\set|0x1f|\clr + or t0, ST0_KERNEL_CUMASK|\set|0x1f|\clr xor t0, 0x1f|\clr mtc0 t0, CP0_STATUS .set noreorder diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 1a08428eedcf..6c590ef27648 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -167,7 +167,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, return -EINVAL; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -181,7 +181,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, cpumask_and(&mask, &allowed, cpu_active_mask); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); if (retval) return retval; diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index f5dc316a826a..75ebd8d7bd5d 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -52,6 +52,7 @@ #include <asm/inst.h> #include <asm/stacktrace.h> #include <asm/irq_regs.h> +#include <asm/exec.h> #ifdef CONFIG_HOTPLUG_CPU void arch_cpu_idle_dead(void) @@ -68,7 +69,7 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) unsigned long status; /* New thread loses kernel privileges. */ - status = regs->cp0_status & ~(ST0_CU0|ST0_CU1|ST0_FR|KU_MASK); + status = regs->cp0_status & ~(ST0_CU0|ST0_CU1|ST0_CU2|ST0_FR|KU_MASK); status |= KU_USER; regs->cp0_status = status; lose_fpu(0); @@ -133,7 +134,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, childregs = (struct pt_regs *) childksp - 1; /* Put the stack after the struct pt_regs. */ childksp = (unsigned long) childregs; - p->thread.cp0_status = read_c0_status() & ~(ST0_CU2|ST0_CU1); + p->thread.cp0_status = (read_c0_status() & ~(ST0_CU2|ST0_CU1)) | ST0_KERNEL_CUMASK; if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ unsigned long status = p->thread.cp0_status; @@ -279,7 +280,21 @@ static inline int is_ra_save_ins(union mips_instruction *ip, int *poff) *poff = ip->i_format.simmediate / sizeof(ulong); return 1; } - +#ifdef CONFIG_CPU_LOONGSON64 + if ((ip->loongson3_lswc2_format.opcode == swc2_op) && + (ip->loongson3_lswc2_format.ls == 1) && + (ip->loongson3_lswc2_format.fr == 0) && + (ip->loongson3_lswc2_format.base == 29)) { + if (ip->loongson3_lswc2_format.rt == 31) { + *poff = ip->loongson3_lswc2_format.offset << 1; + return 1; + } + if (ip->loongson3_lswc2_format.rq == 31) { + *poff = (ip->loongson3_lswc2_format.offset << 1) + 1; + return 1; + } + } +#endif return 0; #endif } diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 9e50dc8df2f6..6abebd57b218 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -36,31 +36,6 @@ char *mips_get_machine_name(void) } #ifdef CONFIG_USE_OF -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ - if (base >= PHYS_ADDR_MAX) { - pr_warn("Trying to add an invalid memory region, skipped\n"); - return; - } - - /* Truncate the passed memory region instead of type casting */ - if (base + size - 1 >= PHYS_ADDR_MAX || base + size < base) { - pr_warn("Truncate memory region %llx @ %llx to size %llx\n", - size, base, PHYS_ADDR_MAX - base); - size = PHYS_ADDR_MAX - base; - } - - add_memory_region(base, size, BOOT_MEM_RAM); -} - -int __init early_init_dt_reserve_memory_arch(phys_addr_t base, - phys_addr_t size, bool nomap) -{ - add_memory_region(base, size, - nomap ? BOOT_MEM_NOMAP : BOOT_MEM_RESERVED); - - return 0; -} void __init __dt_setup_arch(void *bph) { diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 335bd188b8b4..fccdbe2e7c2b 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -24,7 +24,7 @@ #include <linux/kexec.h> #include <linux/sizes.h> #include <linux/device.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/decompress/generic.h> #include <linux/of_fdt.h> #include <linux/of_reserved_mem.h> @@ -91,45 +91,6 @@ unsigned long ARCH_PFN_OFFSET; EXPORT_SYMBOL(ARCH_PFN_OFFSET); #endif -void __init add_memory_region(phys_addr_t start, phys_addr_t size, long type) -{ - /* - * Note: This function only exists for historical reason, - * new code should use memblock_add or memblock_add_node instead. - */ - - /* - * If the region reaches the top of the physical address space, adjust - * the size slightly so that (start + size) doesn't overflow - */ - if (start + size - 1 == PHYS_ADDR_MAX) - --size; - - /* Sanity check */ - if (start + size < start) { - pr_warn("Trying to add an invalid memory region, skipped\n"); - return; - } - - if (start < PHYS_OFFSET) - return; - - memblock_add(start, size); - /* Reserve any memory except the ordinary RAM ranges. */ - switch (type) { - case BOOT_MEM_RAM: - break; - - case BOOT_MEM_NOMAP: /* Discard the range from the system. */ - memblock_remove(start, size); - break; - - default: /* Reserve the rest of the memory types at boot time */ - memblock_reserve(start, size); - break; - } -} - void __init detect_memory_region(phys_addr_t start, phys_addr_t sz_min, phys_addr_t sz_max) { void *dm = &detect_magic; @@ -146,7 +107,7 @@ void __init detect_memory_region(phys_addr_t start, phys_addr_t sz_min, phys_add ((unsigned long long) sz_min) / SZ_1M, ((unsigned long long) sz_max) / SZ_1M); - add_memory_region(start, size, BOOT_MEM_RAM); + memblock_add(start, size); } /* @@ -396,7 +357,7 @@ static int __init early_parse_mem(char *p) if (*p == '@') start = memparse(p + 1, &p); - add_memory_region(start, size, BOOT_MEM_RAM); + memblock_add(start, size); return 0; } @@ -422,13 +383,14 @@ static int __init early_parse_memmap(char *p) if (*p == '@') { start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, BOOT_MEM_RAM); + memblock_add(start_at, mem_size); } else if (*p == '#') { pr_err("\"memmap=nn#ss\" (force ACPI data) invalid on MIPS\n"); return -EINVAL; } else if (*p == '$') { start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, BOOT_MEM_RESERVED); + memblock_add(start_at, mem_size); + memblock_reserve(start_at, mem_size); } else { pr_err("\"memmap\" invalid format!\n"); return -EINVAL; @@ -443,7 +405,7 @@ static int __init early_parse_memmap(char *p) early_param("memmap", early_parse_memmap); #ifdef CONFIG_PROC_VMCORE -unsigned long setup_elfcorehdr, setup_elfcorehdr_size; +static unsigned long setup_elfcorehdr, setup_elfcorehdr_size; static int __init early_parse_elfcorehdr(char *p) { phys_addr_t start, end; @@ -472,6 +434,11 @@ early_param("elfcorehdr", early_parse_elfcorehdr); #endif #ifdef CONFIG_KEXEC + +/* 64M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_64M +#define CRASH_ADDR_MAX SZ_512M + static void __init mips_parse_crashkernel(void) { unsigned long long total_mem; @@ -484,9 +451,22 @@ static void __init mips_parse_crashkernel(void) if (ret != 0 || crash_size <= 0) return; - if (!memblock_find_in_range(crash_base, crash_base + crash_size, crash_size, 1)) { - pr_warn("Invalid memory region reserved for crash kernel\n"); - return; + if (crash_base <= 0) { + crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX, + crash_size, CRASH_ALIGN); + if (!crash_base) { + pr_warn("crashkernel reservation failed - No suitable area found.\n"); + return; + } + } else { + unsigned long long start; + + start = memblock_find_in_range(crash_base, crash_base + crash_size, + crash_size, 1); + if (start != crash_base) { + pr_warn("Invalid memory region reserved for crash kernel\n"); + return; + } } crashk_res.start = crash_base; @@ -621,7 +601,7 @@ static void __init bootcmdline_init(void) * arch_mem_init - initialize memory management subsystem * * o plat_mem_setup() detects the memory configuration and will record detected - * memory areas using add_memory_region. + * memory areas using memblock_add. * * At this stage the memory configuration of the system is known to the * kernel but generic memory management system is still entirely uninitialized. diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index a0262729cd4c..50d0515bea21 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -545,6 +545,12 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) return err ?: protected_restore_fp_context(sc); } +#ifdef CONFIG_WAR_ICACHE_REFILLS +#define SIGMASK ~(cpu_icache_line_size()-1) +#else +#define SIGMASK ALMASK +#endif + void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size) { @@ -565,7 +571,7 @@ void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, sp = sigsp(sp, ksig); - return (void __user *)((sp - frame_size) & (ICACHE_REFILLS_WORKAROUND_WAR ? ~(cpu_icache_line_size()-1) : ALMASK)); + return (void __user *)((sp - frame_size) & SIGMASK); } /* @@ -901,7 +907,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, do_signal(regs); if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index c333e5788664..2afa3eef486a 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -106,7 +106,7 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new) if (unlikely(!access_ok((const void __user *)addr, 4))) return -EINVAL; - if (cpu_has_llsc && R10000_LLSC_WAR) { + if (cpu_has_llsc && IS_ENABLED(CONFIG_WAR_R10000_LLSC)) { __asm__ __volatile__ ( " .set push \n" " .set arch=r4000 \n" diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index cf72a0206a87..32817c954435 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -378,3 +378,4 @@ 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 +440 n32 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 557f9954a2b9..9e4ea3c31b1c 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -354,3 +354,4 @@ 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 +440 n64 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index a17aab5abeb2..29f5f28cf5ce 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -427,3 +427,4 @@ 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 +440 o32 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index cf788591f091..e0352958e2f7 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2204,7 +2204,7 @@ static void configure_status(void) * flag that some firmware may have left set and the TS bit (for * IP27). Set XX for ISA IV code to work. */ - unsigned int status_set = ST0_CU0; + unsigned int status_set = ST0_KERNEL_CUMASK; #ifdef CONFIG_64BIT status_set |= ST0_FR|ST0_KX|ST0_SX|ST0_UX; #endif diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index fd716942e302..832475bf2055 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -205,7 +205,7 @@ static inline void build_set_exc_base(u32 **p, unsigned int reg) * Assemble the start of the vcpu_run function to run a guest VCPU. The function * conforms to the following prototype: * - * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); + * int vcpu_run(struct kvm_vcpu *vcpu); * * The exit from the guest and return to the caller is handled by the code * generated by kvm_mips_build_ret_to_host(). @@ -218,8 +218,7 @@ void *kvm_mips_build_vcpu_run(void *addr) unsigned int i; /* - * A0: run - * A1: vcpu + * A0: vcpu */ /* k0/k1 not being used in host kernel context */ @@ -238,10 +237,10 @@ void *kvm_mips_build_vcpu_run(void *addr) kvm_mips_build_save_scratch(&p, V1, K1); /* VCPU scratch register has pointer to vcpu */ - UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MTC0(&p, A0, scratch_vcpu[0], scratch_vcpu[1]); /* Offset into vcpu->arch */ - UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + UASM_i_ADDIU(&p, K1, A0, offsetof(struct kvm_vcpu, arch)); /* * Save the host stack to VCPU, used for exception processing @@ -645,10 +644,7 @@ void *kvm_mips_build_exit(void *addr) /* Now that context has been saved, we can use other registers */ /* Restore vcpu */ - UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); - - /* Restore run (vcpu->run) */ - UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1); + UASM_i_MFC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]); /* * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process @@ -810,7 +806,6 @@ void *kvm_mips_build_exit(void *addr) * with this in the kernel */ uasm_i_move(&p, A0, S0); - uasm_i_move(&p, A1, S1); UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); uasm_i_jalr(&p, RA, T9); UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ); @@ -852,7 +847,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr) * guest, reload k1 */ - uasm_i_move(&p, K1, S1); + uasm_i_move(&p, K1, S0); UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); /* @@ -886,8 +881,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr) { u32 *p = addr; - /* Put the saved pointer to vcpu (s1) back into the scratch register */ - UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); + /* Put the saved pointer to vcpu (s0) back into the scratch register */ + UASM_i_MTC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]); /* Load up the Guest EBASE to minimize the window where BEV is set */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 0c50ac444222..3d6a7f5827b1 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1199,8 +1199,9 @@ static void kvm_mips_set_c0_status(void) /* * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV) */ -int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; u32 __user *opc = (u32 __user *) vcpu->arch.pc; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index f8cba51e1054..0788c00d7e94 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -1241,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu) */ kvm_mips_suspend_mm(cpu); - r = vcpu->arch.vcpu_run(vcpu->run, vcpu); + r = vcpu->arch.vcpu_run(vcpu); /* We may have migrated while handling guest exits */ cpu = smp_processor_id(); diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index c299e5d6d69c..2ffbe9264a31 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3266,7 +3266,7 @@ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu) kvm_vz_vcpu_load_tlb(vcpu, cpu); kvm_vz_vcpu_load_wired(vcpu); - r = vcpu->arch.vcpu_run(vcpu->run, vcpu); + r = vcpu->arch.vcpu_run(vcpu); kvm_vz_vcpu_save_wired(vcpu); diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index b10342018d19..917fac1636b7 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -112,11 +112,15 @@ static u32 pmu_clk_cr_b[] = { #define PMU_PPE_DP BIT(23) #define PMU_PPE_DPLUS BIT(24) #define PMU_USB1_P BIT(26) +#define PMU_GPHY3 BIT(26) /* grx390 */ #define PMU_USB1 BIT(27) #define PMU_SWITCH BIT(28) #define PMU_PPE_TOP BIT(29) +#define PMU_GPHY0 BIT(29) /* ar10, xrx390 */ #define PMU_GPHY BIT(30) +#define PMU_GPHY1 BIT(30) /* ar10, xrx390 */ #define PMU_PCIE_CLK BIT(31) +#define PMU_GPHY2 BIT(31) /* ar10, xrx390 */ #define PMU1_PCIE_PHY BIT(0) /* vr9-specific,moved in ar10/grx390 */ #define PMU1_PCIE_CTL BIT(1) @@ -465,6 +469,9 @@ void __init ltq_soc_init(void) if (of_machine_is_compatible("lantiq,grx390") || of_machine_is_compatible("lantiq,ar10")) { + clkdev_add_pmu("1e108000.switch", "gphy0", 0, 0, PMU_GPHY0); + clkdev_add_pmu("1e108000.switch", "gphy1", 0, 0, PMU_GPHY1); + clkdev_add_pmu("1e108000.switch", "gphy2", 0, 0, PMU_GPHY2); clkdev_add_pmu("1f203018.usb2-phy", "phy", 1, 2, PMU_ANALOG_USB0_P); clkdev_add_pmu("1f203034.usb2-phy", "phy", 1, 2, PMU_ANALOG_USB1_P); /* rc 0 */ @@ -496,6 +503,7 @@ void __init ltq_soc_init(void) } else if (of_machine_is_compatible("lantiq,grx390")) { clkdev_add_static(ltq_grx390_cpu_hz(), ltq_grx390_fpi_hz(), ltq_grx390_fpi_hz(), ltq_grx390_pp32_hz()); + clkdev_add_pmu("1e108000.switch", "gphy3", 0, 0, PMU_GPHY3); clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0); clkdev_add_pmu("1e106000.usb", "otg", 1, 0, PMU_USB1); /* rc 2 */ @@ -514,8 +522,6 @@ void __init ltq_soc_init(void) clkdev_add_pmu("1e10b308.eth", NULL, 0, 0, PMU_SWITCH | PMU_PPE_DP | PMU_PPE_TC); clkdev_add_pmu("1da00000.usif", "NULL", 1, 0, PMU_USIF); - clkdev_add_pmu("1e108000.switch", "gphy0", 0, 0, PMU_GPHY); - clkdev_add_pmu("1e108000.switch", "gphy1", 0, 0, PMU_GPHY); clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU); clkdev_add_pmu("1e116000.mei", "afe", 1, 2, PMU_ANALOG_DSL_AFE); clkdev_add_pmu("1e116000.mei", "dfe", 1, 0, PMU_DFE); diff --git a/arch/mips/loongson2ef/common/mem.c b/arch/mips/loongson2ef/common/mem.c index ae21f1c62baa..057d58bb470e 100644 --- a/arch/mips/loongson2ef/common/mem.c +++ b/arch/mips/loongson2ef/common/mem.c @@ -17,10 +17,7 @@ u32 memsize, highmemsize; void __init prom_init_memory(void) { - add_memory_region(0x0, (memsize << 20), BOOT_MEM_RAM); - - add_memory_region(memsize << 20, LOONGSON_PCI_MEM_START - (memsize << - 20), BOOT_MEM_RESERVED); + memblock_add(0x0, (memsize << 20)); #ifdef CONFIG_CPU_SUPPORTS_ADDRWINCFG { @@ -41,12 +38,7 @@ void __init prom_init_memory(void) #ifdef CONFIG_64BIT if (highmemsize > 0) - add_memory_region(LOONGSON_HIGHMEM_START, - highmemsize << 20, BOOT_MEM_RAM); - - add_memory_region(LOONGSON_PCI_MEM_END + 1, LOONGSON_HIGHMEM_START - - LOONGSON_PCI_MEM_END - 1, BOOT_MEM_RESERVED); - + memblock_add(LOONGSON_HIGHMEM_START, highmemsize << 20); #endif /* !CONFIG_64BIT */ } diff --git a/arch/mips/loongson2ef/fuloong-2e/dma.c b/arch/mips/loongson2ef/fuloong-2e/dma.c index e122292bf666..cea167d8aba8 100644 --- a/arch/mips/loongson2ef/fuloong-2e/dma.c +++ b/arch/mips/loongson2ef/fuloong-2e/dma.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/dma-direct.h> -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return paddr | 0x80000000; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { return dma_addr & 0x7fffffff; } diff --git a/arch/mips/loongson2ef/lemote-2f/dma.c b/arch/mips/loongson2ef/lemote-2f/dma.c index abf0e39d7e46..3c9e99456357 100644 --- a/arch/mips/loongson2ef/lemote-2f/dma.c +++ b/arch/mips/loongson2ef/lemote-2f/dma.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/dma-direct.h> -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return paddr | 0x80000000; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { if (dma_addr > 0x8fffffff) return dma_addr; diff --git a/arch/mips/loongson32/common/prom.c b/arch/mips/loongson32/common/prom.c index fd76114fa3b0..c133b5adf34e 100644 --- a/arch/mips/loongson32/common/prom.c +++ b/arch/mips/loongson32/common/prom.c @@ -7,8 +7,8 @@ #include <linux/io.h> #include <linux/init.h> +#include <linux/memblock.h> #include <linux/serial_reg.h> -#include <asm/bootinfo.h> #include <asm/fw/fw.h> #include <loongson1.h> @@ -42,5 +42,5 @@ void __init prom_free_prom_memory(void) void __init plat_mem_setup(void) { - add_memory_region(0x0, (memsize << 20), BOOT_MEM_RAM); + memblock_add(0x0, (memsize << 20)); } diff --git a/arch/mips/loongson64/dma.c b/arch/mips/loongson64/dma.c index dbfe6e82fddd..364f2f27c872 100644 --- a/arch/mips/loongson64/dma.c +++ b/arch/mips/loongson64/dma.c @@ -4,7 +4,7 @@ #include <linux/swiotlb.h> #include <boot_param.h> -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { /* We extract 2bit node id (bit 44~47, only bit 44~45 used now) from * Loongson-3's 48bit address space and embed it into 40bit */ @@ -13,7 +13,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return ((nid << 44) ^ paddr) | (nid << node_id_offset); } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { /* We extract 2bit node id (bit 44~47, only bit 44~45 used now) from * Loongson-3's 48bit address space and embed it into 40bit */ diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index ea8bb1bc667e..cf9459f79f9b 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -98,27 +98,6 @@ static void __init init_topology_matrix(void) } } -static unsigned long nid_to_addroffset(unsigned int nid) -{ - unsigned long result; - switch (nid) { - case 0: - default: - result = NODE0_ADDRSPACE_OFFSET; - break; - case 1: - result = NODE1_ADDRSPACE_OFFSET; - break; - case 2: - result = NODE2_ADDRSPACE_OFFSET; - break; - case 3: - result = NODE3_ADDRSPACE_OFFSET; - break; - } - return result; -} - static void __init szmem(unsigned int node) { u32 i, mem_type; @@ -146,7 +125,7 @@ static void __init szmem(unsigned int node) pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); memblock_add_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), node); + PFN_PHYS(node_psize), node); break; case SYSTEM_RAM_HIGH: start_pfn = ((node_id << 44) + mem_start) >> PAGE_SHIFT; @@ -158,7 +137,7 @@ static void __init szmem(unsigned int node) pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); memblock_add_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), node); + PFN_PHYS(node_psize), node); break; case SYSTEM_RAM_RESERVED: pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n", @@ -175,7 +154,7 @@ static void __init node_mem_init(unsigned int node) unsigned long node_addrspace_offset; unsigned long start_pfn, end_pfn; - node_addrspace_offset = nid_to_addroffset(node); + node_addrspace_offset = nid_to_addrbase(node); pr_info("Node%d's addrspace_offset is 0x%lx\n", node, node_addrspace_offset); @@ -242,9 +221,7 @@ void __init paging_init(void) unsigned long zones_size[MAX_NR_ZONES] = {0, }; pagetable_init(); -#ifdef CONFIG_ZONE_DMA32 zones_size[ZONE_DMA32] = MAX_DMA32_PFN; -#endif zones_size[ZONE_NORMAL] = max_low_pfn; free_area_init(zones_size); } diff --git a/arch/mips/loongson64/reset.c b/arch/mips/loongson64/reset.c index bc7671079f0c..3bb8a1ed9348 100644 --- a/arch/mips/loongson64/reset.c +++ b/arch/mips/loongson64/reset.c @@ -15,11 +15,6 @@ #include <loongson.h> #include <boot_param.h> -static inline void loongson_reboot(void) -{ - ((void (*)(void))ioremap(LOONGSON_BOOT_BASE, 4)) (); -} - static void loongson_restart(char *command) { diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 0ef717093262..9cede7ce37e6 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -130,9 +130,10 @@ struct bcache_ops *bcops = &no_sc_ops; #define R4600_HIT_CACHEOP_WAR_IMPL \ do { \ - if (R4600_V2_HIT_CACHEOP_WAR && cpu_is_r4600_v2_x()) \ + if (IS_ENABLED(CONFIG_WAR_R4600_V2_HIT_CACHEOP) && \ + cpu_is_r4600_v2_x()) \ *(volatile unsigned long *)CKSEG1; \ - if (R4600_V1_HIT_CACHEOP_WAR) \ + if (IS_ENABLED(CONFIG_WAR_R4600_V1_HIT_CACHEOP)) \ __asm__ __volatile__("nop;nop;nop;nop"); \ } while (0) @@ -238,7 +239,7 @@ static void r4k_blast_dcache_setup(void) r4k_blast_dcache = blast_dcache128; } -/* force code alignment (used for TX49XX_ICACHE_INDEX_INV_WAR) */ +/* force code alignment (used for CONFIG_WAR_TX49XX_ICACHE_INDEX_INV) */ #define JUMP_TO_ALIGN(order) \ __asm__ __volatile__( \ "b\t1f\n\t" \ @@ -366,10 +367,11 @@ static void r4k_blast_icache_page_indexed_setup(void) else if (ic_lsize == 16) r4k_blast_icache_page_indexed = blast_icache16_page_indexed; else if (ic_lsize == 32) { - if (R4600_V1_INDEX_ICACHEOP_WAR && cpu_is_r4600_v1_x()) + if (IS_ENABLED(CONFIG_WAR_R4600_V1_INDEX_ICACHEOP) && + cpu_is_r4600_v1_x()) r4k_blast_icache_page_indexed = blast_icache32_r4600_v1_page_indexed; - else if (TX49XX_ICACHE_INDEX_INV_WAR) + else if (IS_ENABLED(CONFIG_WAR_TX49XX_ICACHE_INDEX_INV)) r4k_blast_icache_page_indexed = tx49_blast_icache32_page_indexed; else if (current_cpu_type() == CPU_LOONGSON2EF) @@ -394,9 +396,10 @@ static void r4k_blast_icache_setup(void) else if (ic_lsize == 16) r4k_blast_icache = blast_icache16; else if (ic_lsize == 32) { - if (R4600_V1_INDEX_ICACHEOP_WAR && cpu_is_r4600_v1_x()) + if (IS_ENABLED(CONFIG_WAR_R4600_V1_INDEX_ICACHEOP) && + cpu_is_r4600_v1_x()) r4k_blast_icache = blast_r4600_v1_icache32; - else if (TX49XX_ICACHE_INDEX_INV_WAR) + else if (IS_ENABLED(CONFIG_WAR_TX49XX_ICACHE_INDEX_INV)) r4k_blast_icache = tx49_blast_icache32; else if (current_cpu_type() == CPU_LOONGSON2EF) r4k_blast_icache = loongson2_blast_icache32; diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 563c2c0d0c81..38d3d9143b47 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -5,8 +5,7 @@ * swiped from i386, and cloned for MIPS by Geert, polished by Ralf. */ #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/highmem.h> #include <asm/cache.h> @@ -55,22 +54,34 @@ void *arch_dma_set_uncached(void *addr, size_t size) return (void *)(__pa(addr) + UNCAC_BASE); } -static inline void dma_sync_virt(void *addr, size_t size, +static inline void dma_sync_virt_for_device(void *addr, size_t size, enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: dma_cache_wback((unsigned long)addr, size); break; - case DMA_FROM_DEVICE: dma_cache_inv((unsigned long)addr, size); break; - case DMA_BIDIRECTIONAL: dma_cache_wback_inv((unsigned long)addr, size); break; + default: + BUG(); + } +} +static inline void dma_sync_virt_for_cpu(void *addr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_TO_DEVICE: + break; + case DMA_FROM_DEVICE: + case DMA_BIDIRECTIONAL: + dma_cache_inv((unsigned long)addr, size); + break; default: BUG(); } @@ -82,7 +93,7 @@ static inline void dma_sync_virt(void *addr, size_t size, * configured then the bulk of this loop gets optimized out. */ static inline void dma_sync_phys(phys_addr_t paddr, size_t size, - enum dma_data_direction dir) + enum dma_data_direction dir, bool for_device) { struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); unsigned long offset = paddr & ~PAGE_MASK; @@ -90,18 +101,20 @@ static inline void dma_sync_phys(phys_addr_t paddr, size_t size, do { size_t len = left; + void *addr; if (PageHighMem(page)) { - void *addr; - if (offset + len > PAGE_SIZE) len = PAGE_SIZE - offset; + } + + addr = kmap_atomic(page); + if (for_device) + dma_sync_virt_for_device(addr + offset, len, dir); + else + dma_sync_virt_for_cpu(addr + offset, len, dir); + kunmap_atomic(addr); - addr = kmap_atomic(page); - dma_sync_virt(addr + offset, len, dir); - kunmap_atomic(addr); - } else - dma_sync_virt(page_address(page) + offset, size, dir); offset = 0; page++; left -= len; @@ -111,7 +124,7 @@ static inline void dma_sync_phys(phys_addr_t paddr, size_t size, void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { - dma_sync_phys(paddr, size, dir); + dma_sync_phys(paddr, size, dir, true); } #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU @@ -119,18 +132,10 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { if (cpu_needs_post_dma_flush()) - dma_sync_phys(paddr, size, dir); + dma_sync_phys(paddr, size, dir, false); } #endif -void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - - dma_sync_virt(vaddr, size, direction); -} - #ifdef CONFIG_DMA_PERDEV_COHERENT void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) diff --git a/arch/mips/mm/page.c b/arch/mips/mm/page.c index cd805b005509..504bc4047c4c 100644 --- a/arch/mips/mm/page.c +++ b/arch/mips/mm/page.c @@ -250,14 +250,16 @@ static inline void build_clear_pref(u32 **buf, int off) if (cpu_has_cache_cdex_s) { uasm_i_cache(buf, Create_Dirty_Excl_SD, off, A0); } else if (cpu_has_cache_cdex_p) { - if (R4600_V1_HIT_CACHEOP_WAR && cpu_is_r4600_v1_x()) { + if (IS_ENABLED(CONFIG_WAR_R4600_V1_HIT_CACHEOP) && + cpu_is_r4600_v1_x()) { uasm_i_nop(buf); uasm_i_nop(buf); uasm_i_nop(buf); uasm_i_nop(buf); } - if (R4600_V2_HIT_CACHEOP_WAR && cpu_is_r4600_v2_x()) + if (IS_ENABLED(CONFIG_WAR_R4600_V2_HIT_CACHEOP) && + cpu_is_r4600_v2_x()) uasm_i_lw(buf, ZERO, ZERO, AT); uasm_i_cache(buf, Create_Dirty_Excl_D, off, A0); @@ -302,7 +304,7 @@ void build_clear_page(void) else uasm_i_ori(&buf, A2, A0, off); - if (R4600_V2_HIT_CACHEOP_WAR && cpu_is_r4600_v2_x()) + if (IS_ENABLED(CONFIG_WAR_R4600_V2_HIT_CACHEOP) && cpu_is_r4600_v2_x()) uasm_i_lui(&buf, AT, uasm_rel_hi(0xa0000000)); off = cache_line_size ? min(8, pref_bias_clear_store / cache_line_size) @@ -402,14 +404,16 @@ static inline void build_copy_store_pref(u32 **buf, int off) if (cpu_has_cache_cdex_s) { uasm_i_cache(buf, Create_Dirty_Excl_SD, off, A0); } else if (cpu_has_cache_cdex_p) { - if (R4600_V1_HIT_CACHEOP_WAR && cpu_is_r4600_v1_x()) { + if (IS_ENABLED(CONFIG_WAR_R4600_V1_HIT_CACHEOP) && + cpu_is_r4600_v1_x()) { uasm_i_nop(buf); uasm_i_nop(buf); uasm_i_nop(buf); uasm_i_nop(buf); } - if (R4600_V2_HIT_CACHEOP_WAR && cpu_is_r4600_v2_x()) + if (IS_ENABLED(CONFIG_WAR_R4600_V2_HIT_CACHEOP) && + cpu_is_r4600_v2_x()) uasm_i_lw(buf, ZERO, ZERO, AT); uasm_i_cache(buf, Create_Dirty_Excl_D, off, A0); @@ -453,7 +457,7 @@ void build_copy_page(void) else uasm_i_ori(&buf, A2, A0, off); - if (R4600_V2_HIT_CACHEOP_WAR && cpu_is_r4600_v2_x()) + if (IS_ENABLED(CONFIG_WAR_R4600_V2_HIT_CACHEOP) && cpu_is_r4600_v2_x()) uasm_i_lui(&buf, AT, uasm_rel_hi(0xa0000000)); off = cache_line_size ? min(8, pref_bias_copy_load / cache_line_size) * diff --git a/arch/mips/mm/sc-mips.c b/arch/mips/mm/sc-mips.c index 97dc0511e63f..dd0a5becaabd 100644 --- a/arch/mips/mm/sc-mips.c +++ b/arch/mips/mm/sc-mips.c @@ -228,6 +228,7 @@ static inline int __init mips_sc_probe(void) * contradicted by all documentation. */ case MACH_INGENIC_JZ4770: + case MACH_INGENIC_JZ4775: c->scache.ways = 4; break; @@ -236,6 +237,7 @@ static inline int __init mips_sc_probe(void) * but that is contradicted by all documentation. */ case MACH_INGENIC_X1000: + case MACH_INGENIC_X1000E: c->scache.sets = 256; c->scache.ways = 4; break; diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 14f8ba93367f..a7521b8f7658 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -83,14 +83,18 @@ static inline int r4k_250MHZhwbug(void) return 0; } +extern int sb1250_m3_workaround_needed(void); + static inline int __maybe_unused bcm1250_m3_war(void) { - return BCM1250_M3_WAR; + if (IS_ENABLED(CONFIG_SB1_PASS_2_WORKAROUNDS)) + return sb1250_m3_workaround_needed(); + return 0; } static inline int __maybe_unused r10000_llsc_war(void) { - return R10000_LLSC_WAR; + return IS_ENABLED(CONFIG_WAR_R10000_LLSC); } static int use_bbit_insns(void) diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index c56f129c9a4b..81dd226d6b6b 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -394,7 +394,7 @@ I_u2u1u3(_lddir) void uasm_i_pref(u32 **buf, unsigned int a, signed int b, unsigned int c) { - if (CAVIUM_OCTEON_DCACHE_PREFETCH_WAR && a <= 24 && a != 5) + if (OCTEON_IS_MODEL(OCTEON_CN6XXX) && a <= 24 && a != 5) /* * As per erratum Core-14449, replace prefetches 0-4, * 6-24 with 'pref 28'. diff --git a/arch/mips/mti-malta/malta-setup.c b/arch/mips/mti-malta/malta-setup.c index c4ad5a9b4bc1..e1fb8b534944 100644 --- a/arch/mips/mti-malta/malta-setup.c +++ b/arch/mips/mti-malta/malta-setup.c @@ -16,7 +16,6 @@ #include <asm/dma-coherence.h> #include <asm/fw/fw.h> -#include <asm/mach-malta/malta-dtshim.h> #include <asm/mips-cps.h> #include <asm/mips-boards/generic.h> #include <asm/mips-boards/malta.h> diff --git a/arch/mips/netlogic/xlp/setup.c b/arch/mips/netlogic/xlp/setup.c index 6e3102bcd2f1..9adc0c1b4ffc 100644 --- a/arch/mips/netlogic/xlp/setup.c +++ b/arch/mips/netlogic/xlp/setup.c @@ -89,7 +89,7 @@ static void __init xlp_init_mem_from_bars(void) if (map[i] > 0x10000000 && map[i] < 0x20000000) map[i] = 0x20000000; - add_memory_region(map[i], map[i+1] - map[i], BOOT_MEM_RAM); + memblock_add(map[i], map[i+1] - map[i]); } } diff --git a/arch/mips/netlogic/xlr/setup.c b/arch/mips/netlogic/xlr/setup.c index 72ceddc9a03f..627e88101316 100644 --- a/arch/mips/netlogic/xlr/setup.c +++ b/arch/mips/netlogic/xlr/setup.c @@ -34,6 +34,7 @@ #include <linux/kernel.h> #include <linux/serial_8250.h> +#include <linux/memblock.h> #include <linux/pm.h> #include <asm/idle.h> @@ -149,7 +150,7 @@ static void prom_add_memory(void) bootm = (void *)(long)nlm_prom_info.psb_mem_map; for (i = 0; i < bootm->nr_map; i++) { - if (bootm->map[i].type != BOOT_MEM_RAM) + if (bootm->map[i].type != NLM_BOOT_MEM_RAM) continue; start = bootm->map[i].addr; size = bootm->map[i].size; @@ -158,7 +159,7 @@ static void prom_add_memory(void) if (i == 0 && start == 0 && size == 0x0c000000) size = 0x0ff00000; - add_memory_region(start, size - pref_backup, BOOT_MEM_RAM); + memblock_add(start, size - pref_backup); } } diff --git a/arch/mips/pci/pci-ar2315.c b/arch/mips/pci/pci-ar2315.c index 490953f51528..0b15730cef88 100644 --- a/arch/mips/pci/pci-ar2315.c +++ b/arch/mips/pci/pci-ar2315.c @@ -170,12 +170,12 @@ static inline dma_addr_t ar2315_dev_offset(struct device *dev) return 0; } -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return paddr + ar2315_dev_offset(dev); } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { return dma_addr - ar2315_dev_offset(dev); } @@ -423,9 +423,8 @@ static int ar2315_pci_probe(struct platform_device *pdev) return -EINVAL; apc->irq = irq; - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, - "ar2315-pci-ctrl"); - apc->mmr_mem = devm_ioremap_resource(dev, res); + apc->mmr_mem = devm_platform_ioremap_resource_byname(pdev, + "ar2315-pci-ctrl"); if (IS_ERR(apc->mmr_mem)) return PTR_ERR(apc->mmr_mem); diff --git a/arch/mips/pci/pci-ar71xx.c b/arch/mips/pci/pci-ar71xx.c index a9f8e7c881bd..118760b3fa82 100644 --- a/arch/mips/pci/pci-ar71xx.c +++ b/arch/mips/pci/pci-ar71xx.c @@ -336,8 +336,8 @@ static int ar71xx_pci_probe(struct platform_device *pdev) if (!apc) return -ENOMEM; - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg_base"); - apc->cfg_base = devm_ioremap_resource(&pdev->dev, res); + apc->cfg_base = devm_platform_ioremap_resource_byname(pdev, + "cfg_base"); if (IS_ERR(apc->cfg_base)) return PTR_ERR(apc->cfg_base); diff --git a/arch/mips/pci/pci-ar724x.c b/arch/mips/pci/pci-ar724x.c index 869d5c9a2f8d..807558b251ef 100644 --- a/arch/mips/pci/pci-ar724x.c +++ b/arch/mips/pci/pci-ar724x.c @@ -372,18 +372,15 @@ static int ar724x_pci_probe(struct platform_device *pdev) if (!apc) return -ENOMEM; - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ctrl_base"); - apc->ctrl_base = devm_ioremap_resource(&pdev->dev, res); + apc->ctrl_base = devm_platform_ioremap_resource_byname(pdev, "ctrl_base"); if (IS_ERR(apc->ctrl_base)) return PTR_ERR(apc->ctrl_base); - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg_base"); - apc->devcfg_base = devm_ioremap_resource(&pdev->dev, res); + apc->devcfg_base = devm_platform_ioremap_resource_byname(pdev, "cfg_base"); if (IS_ERR(apc->devcfg_base)) return PTR_ERR(apc->devcfg_base); - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "crp_base"); - apc->crp_base = devm_ioremap_resource(&pdev->dev, res); + apc->crp_base = devm_platform_ioremap_resource_byname(pdev, "crp_base"); if (IS_ERR(apc->crp_base)) return PTR_ERR(apc->crp_base); diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c index 9b3cc775c55e..50f7d42cca5a 100644 --- a/arch/mips/pci/pci-xtalk-bridge.c +++ b/arch/mips/pci/pci-xtalk-bridge.c @@ -25,7 +25,7 @@ /* * Common phys<->dma mapping for platforms using pci xtalk bridge */ -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { struct pci_dev *pdev = to_pci_dev(dev); struct bridge_controller *bc = BRIDGE_CONTROLLER(pdev->bus); @@ -33,7 +33,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return bc->baddr + paddr; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { return dma_addr & ~(0xffUL << 56); } diff --git a/arch/mips/pnx833x/Makefile b/arch/mips/pnx833x/Makefile deleted file mode 100644 index 927268a58237..000000000000 --- a/arch/mips/pnx833x/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_SOC_PNX833X) += common/ -obj-$(CONFIG_NXP_STB220) += stb22x/ -obj-$(CONFIG_NXP_STB225) += stb22x/ diff --git a/arch/mips/pnx833x/Platform b/arch/mips/pnx833x/Platform deleted file mode 100644 index e5286a49fc3e..000000000000 --- a/arch/mips/pnx833x/Platform +++ /dev/null @@ -1,4 +0,0 @@ -# NXP STB225 -cflags-$(CONFIG_SOC_PNX833X) += -I$(srctree)/arch/mips/include/asm/mach-pnx833x -load-$(CONFIG_NXP_STB220) += 0xffffffff80001000 -load-$(CONFIG_NXP_STB225) += 0xffffffff80001000 diff --git a/arch/mips/pnx833x/common/Makefile b/arch/mips/pnx833x/common/Makefile deleted file mode 100644 index 9b4d394112b0..000000000000 --- a/arch/mips/pnx833x/common/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-y := interrupts.o platform.o prom.o setup.o reset.o diff --git a/arch/mips/pnx833x/common/interrupts.c b/arch/mips/pnx833x/common/interrupts.c deleted file mode 100644 index 2fbbabcac386..000000000000 --- a/arch/mips/pnx833x/common/interrupts.c +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * interrupts.c: Interrupt mappings for PNX833X. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - */ -#include <linux/kernel.h> -#include <linux/irq.h> -#include <linux/hardirq.h> -#include <linux/interrupt.h> -#include <asm/mipsregs.h> -#include <asm/irq_cpu.h> -#include <asm/setup.h> -#include <irq.h> -#include <irq-mapping.h> -#include <gpio.h> - -static int mips_cpu_timer_irq; - -static const unsigned int irq_prio[PNX833X_PIC_NUM_IRQ] = -{ - 0, /* unused */ - 4, /* PNX833X_PIC_I2C0_INT 1 */ - 4, /* PNX833X_PIC_I2C1_INT 2 */ - 1, /* PNX833X_PIC_UART0_INT 3 */ - 1, /* PNX833X_PIC_UART1_INT 4 */ - 6, /* PNX833X_PIC_TS_IN0_DV_INT 5 */ - 6, /* PNX833X_PIC_TS_IN0_DMA_INT 6 */ - 7, /* PNX833X_PIC_GPIO_INT 7 */ - 4, /* PNX833X_PIC_AUDIO_DEC_INT 8 */ - 5, /* PNX833X_PIC_VIDEO_DEC_INT 9 */ - 4, /* PNX833X_PIC_CONFIG_INT 10 */ - 4, /* PNX833X_PIC_AOI_INT 11 */ - 9, /* PNX833X_PIC_SYNC_INT 12 */ - 9, /* PNX8335_PIC_SATA_INT 13 */ - 4, /* PNX833X_PIC_OSD_INT 14 */ - 9, /* PNX833X_PIC_DISP1_INT 15 */ - 4, /* PNX833X_PIC_DEINTERLACER_INT 16 */ - 9, /* PNX833X_PIC_DISPLAY2_INT 17 */ - 4, /* PNX833X_PIC_VC_INT 18 */ - 4, /* PNX833X_PIC_SC_INT 19 */ - 9, /* PNX833X_PIC_IDE_INT 20 */ - 9, /* PNX833X_PIC_IDE_DMA_INT 21 */ - 6, /* PNX833X_PIC_TS_IN1_DV_INT 22 */ - 6, /* PNX833X_PIC_TS_IN1_DMA_INT 23 */ - 4, /* PNX833X_PIC_SGDX_DMA_INT 24 */ - 4, /* PNX833X_PIC_TS_OUT_INT 25 */ - 4, /* PNX833X_PIC_IR_INT 26 */ - 3, /* PNX833X_PIC_VMSP1_INT 27 */ - 3, /* PNX833X_PIC_VMSP2_INT 28 */ - 4, /* PNX833X_PIC_PIBC_INT 29 */ - 4, /* PNX833X_PIC_TS_IN0_TRD_INT 30 */ - 4, /* PNX833X_PIC_SGDX_TPD_INT 31 */ - 5, /* PNX833X_PIC_USB_INT 32 */ - 4, /* PNX833X_PIC_TS_IN1_TRD_INT 33 */ - 4, /* PNX833X_PIC_CLOCK_INT 34 */ - 4, /* PNX833X_PIC_SGDX_PARSER_INT 35 */ - 4, /* PNX833X_PIC_VMSP_DMA_INT 36 */ -#if defined(CONFIG_SOC_PNX8335) - 4, /* PNX8335_PIC_MIU_INT 37 */ - 4, /* PNX8335_PIC_AVCHIP_IRQ_INT 38 */ - 9, /* PNX8335_PIC_SYNC_HD_INT 39 */ - 9, /* PNX8335_PIC_DISP_HD_INT 40 */ - 9, /* PNX8335_PIC_DISP_SCALER_INT 41 */ - 4, /* PNX8335_PIC_OSD_HD1_INT 42 */ - 4, /* PNX8335_PIC_DTL_WRITER_Y_INT 43 */ - 4, /* PNX8335_PIC_DTL_WRITER_C_INT 44 */ - 4, /* PNX8335_PIC_DTL_EMULATOR_Y_IR_INT 45 */ - 4, /* PNX8335_PIC_DTL_EMULATOR_C_IR_INT 46 */ - 4, /* PNX8335_PIC_DENC_TTX_INT 47 */ - 4, /* PNX8335_PIC_MMI_SIF0_INT 48 */ - 4, /* PNX8335_PIC_MMI_SIF1_INT 49 */ - 4, /* PNX8335_PIC_MMI_CDMMU_INT 50 */ - 4, /* PNX8335_PIC_PIBCS_INT 51 */ - 12, /* PNX8335_PIC_ETHERNET_INT 52 */ - 3, /* PNX8335_PIC_VMSP1_0_INT 53 */ - 3, /* PNX8335_PIC_VMSP1_1_INT 54 */ - 4, /* PNX8335_PIC_VMSP1_DMA_INT 55 */ - 4, /* PNX8335_PIC_TDGR_DE_INT 56 */ - 4, /* PNX8335_PIC_IR1_IRQ_INT 57 */ -#endif -}; - -static void pnx833x_timer_dispatch(void) -{ - do_IRQ(mips_cpu_timer_irq); -} - -static void pic_dispatch(void) -{ - unsigned int irq = PNX833X_REGFIELD(PIC_INT_SRC, INT_SRC); - - if ((irq >= 1) && (irq < (PNX833X_PIC_NUM_IRQ))) { - unsigned long priority = PNX833X_PIC_INT_PRIORITY; - PNX833X_PIC_INT_PRIORITY = irq_prio[irq]; - - if (irq == PNX833X_PIC_GPIO_INT) { - unsigned long mask = PNX833X_PIO_INT_STATUS & PNX833X_PIO_INT_ENABLE; - int pin; - while ((pin = ffs(mask & 0xffff))) { - pin -= 1; - do_IRQ(PNX833X_GPIO_IRQ_BASE + pin); - mask &= ~(1 << pin); - } - } else { - do_IRQ(irq + PNX833X_PIC_IRQ_BASE); - } - - PNX833X_PIC_INT_PRIORITY = priority; - } else { - printk(KERN_ERR "plat_irq_dispatch: unexpected irq %u\n", irq); - } -} - -asmlinkage void plat_irq_dispatch(void) -{ - unsigned int pending = read_c0_status() & read_c0_cause(); - - if (pending & STATUSF_IP4) - pic_dispatch(); - else if (pending & STATUSF_IP7) - do_IRQ(PNX833X_TIMER_IRQ); - else - spurious_interrupt(); -} - -static inline void pnx833x_hard_enable_pic_irq(unsigned int irq) -{ - /* Currently we do this by setting IRQ priority to 1. - If priority support is being implemented, 1 should be repalced - by a better value. */ - PNX833X_PIC_INT_REG(irq) = irq_prio[irq]; -} - -static inline void pnx833x_hard_disable_pic_irq(unsigned int irq) -{ - /* Disable IRQ by writing setting it's priority to 0 */ - PNX833X_PIC_INT_REG(irq) = 0; -} - -static DEFINE_RAW_SPINLOCK(pnx833x_irq_lock); - -static unsigned int pnx833x_startup_pic_irq(unsigned int irq) -{ - unsigned long flags; - unsigned int pic_irq = irq - PNX833X_PIC_IRQ_BASE; - - raw_spin_lock_irqsave(&pnx833x_irq_lock, flags); - pnx833x_hard_enable_pic_irq(pic_irq); - raw_spin_unlock_irqrestore(&pnx833x_irq_lock, flags); - return 0; -} - -static void pnx833x_enable_pic_irq(struct irq_data *d) -{ - unsigned long flags; - unsigned int pic_irq = d->irq - PNX833X_PIC_IRQ_BASE; - - raw_spin_lock_irqsave(&pnx833x_irq_lock, flags); - pnx833x_hard_enable_pic_irq(pic_irq); - raw_spin_unlock_irqrestore(&pnx833x_irq_lock, flags); -} - -static void pnx833x_disable_pic_irq(struct irq_data *d) -{ - unsigned long flags; - unsigned int pic_irq = d->irq - PNX833X_PIC_IRQ_BASE; - - raw_spin_lock_irqsave(&pnx833x_irq_lock, flags); - pnx833x_hard_disable_pic_irq(pic_irq); - raw_spin_unlock_irqrestore(&pnx833x_irq_lock, flags); -} - -static DEFINE_RAW_SPINLOCK(pnx833x_gpio_pnx833x_irq_lock); - -static void pnx833x_enable_gpio_irq(struct irq_data *d) -{ - int pin = d->irq - PNX833X_GPIO_IRQ_BASE; - unsigned long flags; - raw_spin_lock_irqsave(&pnx833x_gpio_pnx833x_irq_lock, flags); - pnx833x_gpio_enable_irq(pin); - raw_spin_unlock_irqrestore(&pnx833x_gpio_pnx833x_irq_lock, flags); -} - -static void pnx833x_disable_gpio_irq(struct irq_data *d) -{ - int pin = d->irq - PNX833X_GPIO_IRQ_BASE; - unsigned long flags; - raw_spin_lock_irqsave(&pnx833x_gpio_pnx833x_irq_lock, flags); - pnx833x_gpio_disable_irq(pin); - raw_spin_unlock_irqrestore(&pnx833x_gpio_pnx833x_irq_lock, flags); -} - -static int pnx833x_set_type_gpio_irq(struct irq_data *d, unsigned int flow_type) -{ - int pin = d->irq - PNX833X_GPIO_IRQ_BASE; - int gpio_mode; - - switch (flow_type) { - case IRQ_TYPE_EDGE_RISING: - gpio_mode = GPIO_INT_EDGE_RISING; - break; - case IRQ_TYPE_EDGE_FALLING: - gpio_mode = GPIO_INT_EDGE_FALLING; - break; - case IRQ_TYPE_EDGE_BOTH: - gpio_mode = GPIO_INT_EDGE_BOTH; - break; - case IRQ_TYPE_LEVEL_HIGH: - gpio_mode = GPIO_INT_LEVEL_HIGH; - break; - case IRQ_TYPE_LEVEL_LOW: - gpio_mode = GPIO_INT_LEVEL_LOW; - break; - default: - gpio_mode = GPIO_INT_NONE; - break; - } - - pnx833x_gpio_setup_irq(gpio_mode, pin); - - return 0; -} - -static struct irq_chip pnx833x_pic_irq_type = { - .name = "PNX-PIC", - .irq_enable = pnx833x_enable_pic_irq, - .irq_disable = pnx833x_disable_pic_irq, -}; - -static struct irq_chip pnx833x_gpio_irq_type = { - .name = "PNX-GPIO", - .irq_enable = pnx833x_enable_gpio_irq, - .irq_disable = pnx833x_disable_gpio_irq, - .irq_set_type = pnx833x_set_type_gpio_irq, -}; - -void __init arch_init_irq(void) -{ - unsigned int irq; - - /* setup standard internal cpu irqs */ - mips_cpu_irq_init(); - - /* Set IRQ information in irq_desc */ - for (irq = PNX833X_PIC_IRQ_BASE; irq < (PNX833X_PIC_IRQ_BASE + PNX833X_PIC_NUM_IRQ); irq++) { - pnx833x_hard_disable_pic_irq(irq); - irq_set_chip_and_handler(irq, &pnx833x_pic_irq_type, - handle_simple_irq); - } - - for (irq = PNX833X_GPIO_IRQ_BASE; irq < (PNX833X_GPIO_IRQ_BASE + PNX833X_GPIO_NUM_IRQ); irq++) - irq_set_chip_and_handler(irq, &pnx833x_gpio_irq_type, - handle_simple_irq); - - /* Set PIC priority limiter register to 0 */ - PNX833X_PIC_INT_PRIORITY = 0; - - /* Setup GPIO IRQ dispatching */ - pnx833x_startup_pic_irq(PNX833X_PIC_GPIO_INT); - - /* Enable PIC IRQs (HWIRQ2) */ - if (cpu_has_vint) - set_vi_handler(4, pic_dispatch); - - write_c0_status(read_c0_status() | IE_IRQ2); -} - -unsigned int get_c0_compare_int(void) -{ - if (cpu_has_vint) - set_vi_handler(cp0_compare_irq, pnx833x_timer_dispatch); - - mips_cpu_timer_irq = MIPS_CPU_IRQ_BASE + cp0_compare_irq; - return mips_cpu_timer_irq; -} - -void __init plat_time_init(void) -{ - /* calculate mips_hpt_frequency based on PNX833X_CLOCK_CPUCP_CTL reg */ - - extern unsigned long mips_hpt_frequency; - unsigned long reg = PNX833X_CLOCK_CPUCP_CTL; - - if (!(PNX833X_BIT(reg, CLOCK_CPUCP_CTL, EXIT_RESET))) { - /* Functional clock is disabled so use crystal frequency */ - mips_hpt_frequency = 25; - } else { -#if defined(CONFIG_SOC_PNX8335) - /* Functional clock is enabled, so get clock multiplier */ - mips_hpt_frequency = 90 + (10 * PNX8335_REGFIELD(CLOCK_PLL_CPU_CTL, FREQ)); -#else - static const unsigned long int freq[4] = {240, 160, 120, 80}; - mips_hpt_frequency = freq[PNX833X_FIELD(reg, CLOCK_CPUCP_CTL, DIV_CLOCK)]; -#endif - } - - printk(KERN_INFO "CPU clock is %ld MHz\n", mips_hpt_frequency); - - mips_hpt_frequency *= 500000; -} diff --git a/arch/mips/pnx833x/common/platform.c b/arch/mips/pnx833x/common/platform.c deleted file mode 100644 index 5fa0373f1c9e..000000000000 --- a/arch/mips/pnx833x/common/platform.c +++ /dev/null @@ -1,224 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * platform.c: platform support for PNX833X. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - * - * Based on software written by: - * Nikita Youshchenko <yoush@debian.org>, based on PNX8550 code. - */ -#include <linux/device.h> -#include <linux/dma-mapping.h> -#include <linux/platform_device.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/resource.h> -#include <linux/serial.h> -#include <linux/serial_pnx8xxx.h> -#include <linux/mtd/platnand.h> - -#include <irq.h> -#include <irq-mapping.h> -#include <pnx833x.h> - -static u64 uart_dmamask = DMA_BIT_MASK(32); - -static struct resource pnx833x_uart_resources[] = { - [0] = { - .start = PNX833X_UART0_PORTS_START, - .end = PNX833X_UART0_PORTS_END, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = PNX833X_PIC_UART0_INT, - .end = PNX833X_PIC_UART0_INT, - .flags = IORESOURCE_IRQ, - }, - [2] = { - .start = PNX833X_UART1_PORTS_START, - .end = PNX833X_UART1_PORTS_END, - .flags = IORESOURCE_MEM, - }, - [3] = { - .start = PNX833X_PIC_UART1_INT, - .end = PNX833X_PIC_UART1_INT, - .flags = IORESOURCE_IRQ, - }, -}; - -struct pnx8xxx_port pnx8xxx_ports[] = { - [0] = { - .port = { - .type = PORT_PNX8XXX, - .iotype = UPIO_MEM, - .membase = (void __iomem *)PNX833X_UART0_PORTS_START, - .mapbase = PNX833X_UART0_PORTS_START, - .irq = PNX833X_PIC_UART0_INT, - .uartclk = 3692300, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .line = 0, - }, - }, - [1] = { - .port = { - .type = PORT_PNX8XXX, - .iotype = UPIO_MEM, - .membase = (void __iomem *)PNX833X_UART1_PORTS_START, - .mapbase = PNX833X_UART1_PORTS_START, - .irq = PNX833X_PIC_UART1_INT, - .uartclk = 3692300, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .line = 1, - }, - }, -}; - -static struct platform_device pnx833x_uart_device = { - .name = "pnx8xxx-uart", - .id = -1, - .dev = { - .dma_mask = &uart_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(32), - .platform_data = pnx8xxx_ports, - }, - .num_resources = ARRAY_SIZE(pnx833x_uart_resources), - .resource = pnx833x_uart_resources, -}; - -static u64 ehci_dmamask = DMA_BIT_MASK(32); - -static struct resource pnx833x_usb_ehci_resources[] = { - [0] = { - .start = PNX833X_USB_PORTS_START, - .end = PNX833X_USB_PORTS_END, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = PNX833X_PIC_USB_INT, - .end = PNX833X_PIC_USB_INT, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device pnx833x_usb_ehci_device = { - .name = "pnx833x-ehci", - .id = -1, - .dev = { - .dma_mask = &ehci_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(32), - }, - .num_resources = ARRAY_SIZE(pnx833x_usb_ehci_resources), - .resource = pnx833x_usb_ehci_resources, -}; - -static u64 ethernet_dmamask = DMA_BIT_MASK(32); - -static struct resource pnx833x_ethernet_resources[] = { - [0] = { - .start = PNX8335_IP3902_PORTS_START, - .end = PNX8335_IP3902_PORTS_END, - .flags = IORESOURCE_MEM, - }, -#ifdef CONFIG_SOC_PNX8335 - [1] = { - .start = PNX8335_PIC_ETHERNET_INT, - .end = PNX8335_PIC_ETHERNET_INT, - .flags = IORESOURCE_IRQ, - }, -#endif -}; - -static struct platform_device pnx833x_ethernet_device = { - .name = "ip3902-eth", - .id = -1, - .dev = { - .dma_mask = ðernet_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(32), - }, - .num_resources = ARRAY_SIZE(pnx833x_ethernet_resources), - .resource = pnx833x_ethernet_resources, -}; - -static struct resource pnx833x_sata_resources[] = { - [0] = { - .start = PNX8335_SATA_PORTS_START, - .end = PNX8335_SATA_PORTS_END, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = PNX8335_PIC_SATA_INT, - .end = PNX8335_PIC_SATA_INT, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device pnx833x_sata_device = { - .name = "pnx833x-sata", - .id = -1, - .num_resources = ARRAY_SIZE(pnx833x_sata_resources), - .resource = pnx833x_sata_resources, -}; - -static void -pnx833x_flash_nand_cmd_ctrl(struct nand_chip *this, int cmd, unsigned int ctrl) -{ - unsigned long nandaddr = (unsigned long)this->legacy.IO_ADDR_W; - - if (cmd == NAND_CMD_NONE) - return; - - if (ctrl & NAND_CLE) - writeb(cmd, (void __iomem *)(nandaddr + PNX8335_NAND_CLE_MASK)); - else - writeb(cmd, (void __iomem *)(nandaddr + PNX8335_NAND_ALE_MASK)); -} - -static struct platform_nand_data pnx833x_flash_nand_data = { - .chip = { - .nr_chips = 1, - .chip_delay = 25, - }, - .ctrl = { - .cmd_ctrl = pnx833x_flash_nand_cmd_ctrl - } -}; - -/* - * Set start to be the correct address (PNX8335_NAND_BASE with no 0xb!!), - * 12 bytes more seems to be the standard that allows for NAND access. - */ -static struct resource pnx833x_flash_nand_resource = { - .start = PNX8335_NAND_BASE, - .end = PNX8335_NAND_BASE + 12, - .flags = IORESOURCE_MEM, -}; - -static struct platform_device pnx833x_flash_nand = { - .name = "gen_nand", - .id = -1, - .num_resources = 1, - .resource = &pnx833x_flash_nand_resource, - .dev = { - .platform_data = &pnx833x_flash_nand_data, - }, -}; - -static struct platform_device *pnx833x_platform_devices[] __initdata = { - &pnx833x_uart_device, - &pnx833x_usb_ehci_device, - &pnx833x_ethernet_device, - &pnx833x_sata_device, - &pnx833x_flash_nand, -}; - -static int __init pnx833x_platform_init(void) -{ - return platform_add_devices(pnx833x_platform_devices, - ARRAY_SIZE(pnx833x_platform_devices)); -} - -arch_initcall(pnx833x_platform_init); diff --git a/arch/mips/pnx833x/common/prom.c b/arch/mips/pnx833x/common/prom.c deleted file mode 100644 index 12733ef25782..000000000000 --- a/arch/mips/pnx833x/common/prom.c +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * prom.c: - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - * - * Based on software written by: - * Nikita Youshchenko <yoush@debian.org>, based on PNX8550 code. - */ -#include <linux/init.h> -#include <asm/bootinfo.h> -#include <linux/string.h> - -void __init prom_init_cmdline(void) -{ - int argc = fw_arg0; - char **argv = (char **)fw_arg1; - char *c = &(arcs_cmdline[0]); - int i; - - for (i = 1; i < argc; i++) { - strcpy(c, argv[i]); - c += strlen(argv[i]); - if (i < argc-1) - *c++ = ' '; - } - *c = 0; -} - -char __init *prom_getenv(char *envname) -{ - extern char **prom_envp; - char **env = prom_envp; - int i; - - i = strlen(envname); - - while (*env) { - if (strncmp(envname, *env, i) == 0 && *(*env+i) == '=') - return *env + i + 1; - env++; - } - - return 0; -} - -void __init prom_free_prom_memory(void) -{ -} diff --git a/arch/mips/pnx833x/common/reset.c b/arch/mips/pnx833x/common/reset.c deleted file mode 100644 index b48e83bf912b..000000000000 --- a/arch/mips/pnx833x/common/reset.c +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * reset.c: reset support for PNX833X. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - * - * Based on software written by: - * Nikita Youshchenko <yoush@debian.org>, based on PNX8550 code. - */ -#include <linux/reboot.h> -#include <pnx833x.h> - -void pnx833x_machine_restart(char *command) -{ - PNX833X_RESET_CONTROL_2 = 0; - PNX833X_RESET_CONTROL = 0; -} - -void pnx833x_machine_halt(void) -{ - while (1) - __asm__ __volatile__ ("wait"); - -} - -void pnx833x_machine_power_off(void) -{ - pnx833x_machine_halt(); -} diff --git a/arch/mips/pnx833x/common/setup.c b/arch/mips/pnx833x/common/setup.c deleted file mode 100644 index abf68d92ce4a..000000000000 --- a/arch/mips/pnx833x/common/setup.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * setup.c: Setup PNX833X Soc. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - * - * Based on software written by: - * Nikita Youshchenko <yoush@debian.org>, based on PNX8550 code. - */ -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/ioport.h> -#include <linux/io.h> -#include <linux/pci.h> -#include <asm/reboot.h> -#include <pnx833x.h> -#include <gpio.h> - -extern void pnx833x_board_setup(void); -extern void pnx833x_machine_restart(char *); -extern void pnx833x_machine_halt(void); -extern void pnx833x_machine_power_off(void); - -int __init plat_mem_setup(void) -{ - /* set mips clock to 320MHz */ -#if defined(CONFIG_SOC_PNX8335) - PNX8335_WRITEFIELD(0x17, CLOCK_PLL_CPU_CTL, FREQ); -#endif - pnx833x_gpio_init(); /* so it will be ready in board_setup() */ - - pnx833x_board_setup(); - - _machine_restart = pnx833x_machine_restart; - _machine_halt = pnx833x_machine_halt; - pm_power_off = pnx833x_machine_power_off; - - /* IO/MEM resources. */ - set_io_port_base(KSEG1); - ioport_resource.start = 0; - ioport_resource.end = ~0; - iomem_resource.start = 0; - iomem_resource.end = ~0; - - return 0; -} diff --git a/arch/mips/pnx833x/stb22x/board.c b/arch/mips/pnx833x/stb22x/board.c deleted file mode 100644 index 93d8e7b73427..000000000000 --- a/arch/mips/pnx833x/stb22x/board.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * board.c: STB225 board support. - * - * Copyright 2008 NXP Semiconductors - * Chris Steel <chris.steel@nxp.com> - * Daniel Laird <daniel.j.laird@nxp.com> - * - * Based on software written by: - * Nikita Youshchenko <yoush@debian.org>, based on PNX8550 code. - */ -#include <linux/init.h> -#include <asm/bootinfo.h> -#include <linux/mm.h> -#include <pnx833x.h> -#include <gpio.h> - -/* endianess twiddlers */ -#define PNX8335_DEBUG0 0x4400 -#define PNX8335_DEBUG1 0x4404 -#define PNX8335_DEBUG2 0x4408 -#define PNX8335_DEBUG3 0x440c -#define PNX8335_DEBUG4 0x4410 -#define PNX8335_DEBUG5 0x4414 -#define PNX8335_DEBUG6 0x4418 -#define PNX8335_DEBUG7 0x441c - -int prom_argc; -char **prom_argv, **prom_envp; - -extern void prom_init_cmdline(void); -extern char *prom_getenv(char *envname); - -const char *get_system_type(void) -{ - return "NXP STB22x"; -} - -static inline unsigned long env_or_default(char *env, unsigned long dfl) -{ - char *str = prom_getenv(env); - return str ? simple_strtol(str, 0, 0) : dfl; -} - -void __init prom_init(void) -{ - unsigned long memsize; - - prom_argc = fw_arg0; - prom_argv = (char **)fw_arg1; - prom_envp = (char **)fw_arg2; - - prom_init_cmdline(); - - memsize = env_or_default("memsize", 0x02000000); - add_memory_region(0, memsize, BOOT_MEM_RAM); -} - -void __init pnx833x_board_setup(void) -{ - pnx833x_gpio_select_function_alt(4); - pnx833x_gpio_select_output(4); - pnx833x_gpio_select_function_alt(5); - pnx833x_gpio_select_input(5); - pnx833x_gpio_select_function_alt(6); - pnx833x_gpio_select_input(6); - pnx833x_gpio_select_function_alt(7); - pnx833x_gpio_select_output(7); - - pnx833x_gpio_select_function_alt(25); - pnx833x_gpio_select_function_alt(26); - - pnx833x_gpio_select_function_alt(27); - pnx833x_gpio_select_function_alt(28); - pnx833x_gpio_select_function_alt(29); - pnx833x_gpio_select_function_alt(30); - pnx833x_gpio_select_function_alt(31); - pnx833x_gpio_select_function_alt(32); - pnx833x_gpio_select_function_alt(33); - -#if IS_ENABLED(CONFIG_MTD_NAND_PLATFORM) - /* Setup MIU for NAND access on CS0... - * - * (it seems that we must also configure CS1 for reliable operation, - * otherwise the first read ID command will fail if it's read as 4 bytes - * but pass if it's read as 1 word.) - */ - - /* Setup MIU CS0 & CS1 timing */ - PNX833X_MIU_SEL0 = 0; - PNX833X_MIU_SEL1 = 0; - PNX833X_MIU_SEL0_TIMING = 0x50003081; - PNX833X_MIU_SEL1_TIMING = 0x50003081; - - /* Setup GPIO 00 for use as MIU CS1 (CS0 is not multiplexed, so does not need this) */ - pnx833x_gpio_select_function_alt(0); - - /* Setup GPIO 04 to input NAND read/busy signal */ - pnx833x_gpio_select_function_io(4); - pnx833x_gpio_select_input(4); - - /* Setup GPIO 05 to disable NAND write protect */ - pnx833x_gpio_select_function_io(5); - pnx833x_gpio_select_output(5); - pnx833x_gpio_write(1, 5); - -#elif IS_ENABLED(CONFIG_MTD_CFI) - - /* Set up MIU for 16-bit NOR access on CS0 and CS1... */ - - /* Setup MIU CS0 & CS1 timing */ - PNX833X_MIU_SEL0 = 1; - PNX833X_MIU_SEL1 = 1; - PNX833X_MIU_SEL0_TIMING = 0x6A08D082; - PNX833X_MIU_SEL1_TIMING = 0x6A08D082; - - /* Setup GPIO 00 for use as MIU CS1 (CS0 is not multiplexed, so does not need this) */ - pnx833x_gpio_select_function_alt(0); -#endif -} diff --git a/arch/mips/ralink/of.c b/arch/mips/ralink/of.c index 90c6d4a11c5d..cbae9d23ab7f 100644 --- a/arch/mips/ralink/of.c +++ b/arch/mips/ralink/of.c @@ -84,8 +84,7 @@ void __init plat_mem_setup(void) if (memory_dtb) of_scan_flat_dt(early_init_dt_scan_memory, NULL); else if (soc_info.mem_size) - add_memory_region(soc_info.mem_base, soc_info.mem_size * SZ_1M, - BOOT_MEM_RAM); + memblock_add(soc_info.mem_base, soc_info.mem_size * SZ_1M); else detect_memory_region(soc_info.mem_base, soc_info.mem_size_min * SZ_1M, diff --git a/arch/mips/rb532/prom.c b/arch/mips/rb532/prom.c index 303cc3dc1749..a9d1f2019dc3 100644 --- a/arch/mips/rb532/prom.c +++ b/arch/mips/rb532/prom.c @@ -126,5 +126,5 @@ void __init prom_init(void) /* give all RAM to boot allocator, * except for the first 0x400 and the last 0x200 bytes */ - add_memory_region(ddrbase + 0x400, memsize - 0x600, BOOT_MEM_RAM); + memblock_add(ddrbase + 0x400, memsize - 0x600); } diff --git a/arch/mips/sgi-ip30/ip30-common.h b/arch/mips/sgi-ip30/ip30-common.h index d2bcaee712f3..7b5db24b6279 100644 --- a/arch/mips/sgi-ip30/ip30-common.h +++ b/arch/mips/sgi-ip30/ip30-common.h @@ -3,6 +3,20 @@ #ifndef __IP30_COMMON_H #define __IP30_COMMON_H +/* + * Power Switch is wired via BaseIO BRIDGE slot #6. + * + * ACFail is wired via BaseIO BRIDGE slot #7. + */ +#define IP30_POWER_IRQ HEART_L2_INT_POWER_BTN + +#define IP30_HEART_L0_IRQ (MIPS_CPU_IRQ_BASE + 2) +#define IP30_HEART_L1_IRQ (MIPS_CPU_IRQ_BASE + 3) +#define IP30_HEART_L2_IRQ (MIPS_CPU_IRQ_BASE + 4) +#define IP30_HEART_TIMER_IRQ (MIPS_CPU_IRQ_BASE + 5) +#define IP30_HEART_ERR_IRQ (MIPS_CPU_IRQ_BASE + 6) + +extern void __init ip30_install_ipi(void); extern struct plat_smp_ops ip30_smp_ops; extern void __init ip30_per_cpu_init(void); diff --git a/arch/mips/sgi-ip30/ip30-irq.c b/arch/mips/sgi-ip30/ip30-irq.c index c2ffcb920250..e8374e4c705b 100644 --- a/arch/mips/sgi-ip30/ip30-irq.c +++ b/arch/mips/sgi-ip30/ip30-irq.c @@ -14,6 +14,8 @@ #include <asm/irq_cpu.h> #include <asm/sgi/heart.h> +#include "ip30-common.h" + struct heart_irq_data { u64 *irq_mask; int cpu; diff --git a/arch/mips/sgi-ip32/ip32-dma.c b/arch/mips/sgi-ip32/ip32-dma.c index fa7b17cb5385..20c6da9d76bc 100644 --- a/arch/mips/sgi-ip32/ip32-dma.c +++ b/arch/mips/sgi-ip32/ip32-dma.c @@ -18,7 +18,7 @@ #define RAM_OFFSET_MASK 0x3fffffffUL -dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { dma_addr_t dma_addr = paddr & RAM_OFFSET_MASK; @@ -27,7 +27,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return dma_addr; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr = dma_addr & RAM_OFFSET_MASK; diff --git a/arch/mips/sgi-ip32/ip32-memory.c b/arch/mips/sgi-ip32/ip32-memory.c index 62b956cc2d1d..0f53fed39da6 100644 --- a/arch/mips/sgi-ip32/ip32-memory.c +++ b/arch/mips/sgi-ip32/ip32-memory.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <asm/ip32/crime.h> @@ -36,7 +37,7 @@ void __init prom_meminit(void) printk("CRIME MC: bank %u base 0x%016Lx size %LuMiB\n", bank, base, size >> 20); - add_memory_region(base, size, BOOT_MEM_RAM); + memblock_add(base, size); } } diff --git a/arch/mips/sgi-ip32/ip32-setup.c b/arch/mips/sgi-ip32/ip32-setup.c index 3abd1465ec02..8019dae1721a 100644 --- a/arch/mips/sgi-ip32/ip32-setup.c +++ b/arch/mips/sgi-ip32/ip32-setup.c @@ -12,12 +12,10 @@ #include <linux/console.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <linux/mc146818rtc.h> #include <linux/param.h> #include <linux/sched.h> #include <asm/bootinfo.h> -#include <asm/mc146818-time.h> #include <asm/mipsregs.h> #include <asm/mmu_context.h> #include <asm/sgialib.h> diff --git a/arch/mips/sibyte/common/cfe.c b/arch/mips/sibyte/common/cfe.c index cbf5939ed53a..89f7fca45152 100644 --- a/arch/mips/sibyte/common/cfe.c +++ b/arch/mips/sibyte/common/cfe.c @@ -114,16 +114,14 @@ static __init void prom_meminit(void) if (initrd_start) { if ((initrd_pstart > addr) && (initrd_pstart < (addr + size))) { - add_memory_region(addr, - initrd_pstart - addr, - BOOT_MEM_RAM); + memblock_add(addr, + initrd_pstart - addr); rd_flag = 1; } if ((initrd_pend > addr) && (initrd_pend < (addr + size))) { - add_memory_region(initrd_pend, - (addr + size) - initrd_pend, - BOOT_MEM_RAM); + memblock_add(initrd_pend, + (addr + size) - initrd_pend); rd_flag = 1; } } @@ -142,7 +140,7 @@ static __init void prom_meminit(void) */ if (size > 512) size -= 512; - add_memory_region(addr, size, BOOT_MEM_RAM); + memblock_add(addr, size); } board_mem_region_addrs[board_mem_region_count] = addr; board_mem_region_sizes[board_mem_region_count] = size; @@ -158,8 +156,8 @@ static __init void prom_meminit(void) } #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start) { - add_memory_region(initrd_pstart, initrd_pend - initrd_pstart, - BOOT_MEM_RESERVED); + memblock_add(initrd_pstart, initrd_pend - initrd_pstart); + memblock_reserve(initrd_pstart, initrd_pend - initrd_pstart); } #endif } diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c index 360c388f4c82..bf8a3cdababf 100644 --- a/arch/mips/txx9/generic/setup_tx4939.c +++ b/arch/mips/txx9/generic/setup_tx4939.c @@ -22,7 +22,6 @@ #include <linux/mtd/physmap.h> #include <linux/platform_device.h> #include <linux/platform_data/txx9/ndfmc.h> -#include <asm/bootinfo.h> #include <asm/reboot.h> #include <asm/traps.h> #include <asm/txx9irq.h> @@ -94,22 +93,6 @@ static struct resource tx4939_sdram_resource[4]; static struct resource tx4939_sram_resource; #define TX4939_SRAM_SIZE 0x800 -void __init tx4939_add_memory_regions(void) -{ - int i; - unsigned long start, size; - u64 win; - - for (i = 0; i < 4; i++) { - if (!((__u32)____raw_readq(&tx4939_ddrcptr->winen) & (1 << i))) - continue; - win = ____raw_readq(&tx4939_ddrcptr->win[i]); - start = (unsigned long)(win >> 48); - size = (((unsigned long)(win >> 32) & 0xffff) + 1) - start; - add_memory_region(start << 20, size << 20, BOOT_MEM_RAM); - } -} - void __init tx4939_setup(void) { int i; diff --git a/arch/mips/txx9/jmr3927/prom.c b/arch/mips/txx9/jmr3927/prom.c index 68a96473c134..53c68de54d30 100644 --- a/arch/mips/txx9/jmr3927/prom.c +++ b/arch/mips/txx9/jmr3927/prom.c @@ -37,7 +37,7 @@ */ #include <linux/init.h> #include <linux/kernel.h> -#include <asm/bootinfo.h> +#include <linux/memblock.h> #include <asm/txx9/generic.h> #include <asm/txx9/jmr3927.h> @@ -47,6 +47,6 @@ void __init jmr3927_prom_init(void) if ((tx3927_ccfgptr->ccfg & TX3927_CCFG_TLBOFF) == 0) pr_err("TX3927 TLB off\n"); - add_memory_region(0, JMR3927_SDRAM_SIZE, BOOT_MEM_RAM); + memblock_add(0, JMR3927_SDRAM_SIZE); txx9_sio_putchar_init(TX3927_SIO_REG(1)); } diff --git a/arch/mips/txx9/rbtx4927/prom.c b/arch/mips/txx9/rbtx4927/prom.c index fe6d0b54763f..9b4acff826eb 100644 --- a/arch/mips/txx9/rbtx4927/prom.c +++ b/arch/mips/txx9/rbtx4927/prom.c @@ -29,13 +29,14 @@ * with this program; if not, write to the Free Software Foundation, Inc., * 675 Mass Ave, Cambridge, MA 02139, USA. */ + #include <linux/init.h> -#include <asm/bootinfo.h> +#include <linux/memblock.h> #include <asm/txx9/generic.h> #include <asm/txx9/rbtx4927.h> void __init rbtx4927_prom_init(void) { - add_memory_region(0, tx4927_get_mem_size(), BOOT_MEM_RAM); + memblock_add(0, tx4927_get_mem_size()); txx9_sio_putchar_init(TX4927_SIO_REG(0) & 0xfffffffffULL); } diff --git a/arch/mips/txx9/rbtx4938/prom.c b/arch/mips/txx9/rbtx4938/prom.c index 2b36a2ee744c..0de84716a428 100644 --- a/arch/mips/txx9/rbtx4938/prom.c +++ b/arch/mips/txx9/rbtx4938/prom.c @@ -12,12 +12,11 @@ #include <linux/init.h> #include <linux/memblock.h> -#include <asm/bootinfo.h> #include <asm/txx9/generic.h> #include <asm/txx9/rbtx4938.h> void __init rbtx4938_prom_init(void) { - add_memory_region(0, tx4938_get_mem_size(), BOOT_MEM_RAM); + memblock_add(0, tx4938_get_mem_size()); txx9_sio_putchar_init(TX4938_SIO_REG(0) & 0xfffffffffULL); } diff --git a/arch/mips/txx9/rbtx4939/prom.c b/arch/mips/txx9/rbtx4939/prom.c index bd277ecb4ad6..ba25ba1bd2ec 100644 --- a/arch/mips/txx9/rbtx4939/prom.c +++ b/arch/mips/txx9/rbtx4939/prom.c @@ -7,11 +7,23 @@ */ #include <linux/init.h> +#include <linux/memblock.h> #include <asm/txx9/generic.h> #include <asm/txx9/rbtx4939.h> void __init rbtx4939_prom_init(void) { - tx4939_add_memory_regions(); + unsigned long start, size; + u64 win; + int i; + + for (i = 0; i < 4; i++) { + if (!((__u32)____raw_readq(&tx4939_ddrcptr->winen) & (1 << i))) + continue; + win = ____raw_readq(&tx4939_ddrcptr->win[i]); + start = (unsigned long)(win >> 48); + size = (((unsigned long)(win >> 32) & 0xffff) + 1) - start; + memblock_add(start << 20, size << 20); + } txx9_sio_putchar_init(TX4939_SIO_REG(0) & 0xfffffffffULL); } diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 57fe83235281..5810cc12bc1d 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -61,7 +61,7 @@ endif # VDSO linker flags. ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \ $(filter -E%,$(KBUILD_CFLAGS)) -nostdlib -shared \ - -G 0 --eh-frame-hdr --hash-style=sysv --build-id -T + -G 0 --eh-frame-hdr --hash-style=sysv --build-id=sha1 -T CFLAGS_REMOVE_vdso.o = -pg diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index e30298e99e1b..e8e541fd2267 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -48,6 +48,7 @@ config NDS32 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE + select SET_FS help Andes(nds32) Linux support. diff --git a/arch/nds32/kernel/dma.c b/arch/nds32/kernel/dma.c index 69d762182d49..2ac8e6c82a61 100644 --- a/arch/nds32/kernel/dma.c +++ b/arch/nds32/kernel/dma.c @@ -3,7 +3,7 @@ #include <linux/types.h> #include <linux/mm.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/cache.h> #include <linux/highmem.h> #include <asm/cacheflush.h> diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c index 36e25a410bb0..2acb94812af9 100644 --- a/arch/nds32/kernel/signal.c +++ b/arch/nds32/kernel/signal.c @@ -379,8 +379,6 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags) if (thread_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/nds32/kernel/vdso/Makefile b/arch/nds32/kernel/vdso/Makefile index 7c3c1ccb196e..55df25ef0057 100644 --- a/arch/nds32/kernel/vdso/Makefile +++ b/arch/nds32/kernel/vdso/Makefile @@ -20,7 +20,7 @@ GCOV_PROFILE := n obj-y += vdso.o -extra-y += vdso.lds +targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Force dependency diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index c6645141bb2a..c7c6ba6bec9d 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -27,6 +27,7 @@ config NIOS2 select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE if MMU + select SET_FS config GENERIC_CSUM def_bool y diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 88a4ec03edab..4ffe857e6ada 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -266,5 +266,5 @@ asmlinkage int nios2_clone(unsigned long clone_flags, unsigned long newsp, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c index d8a087cf2b42..cf2dca2ac7c3 100644 --- a/arch/nios2/kernel/signal.c +++ b/arch/nios2/kernel/signal.c @@ -317,7 +317,7 @@ asmlinkage int do_notify_resume(struct pt_regs *regs) */ return restart; } - } else if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + } else if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); return 0; diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 7e94fe37cb2f..6233c6293180 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -39,6 +39,7 @@ config OPENRISC select ARCH_WANT_FRAME_POINTERS select GENERIC_IRQ_MULTI_HANDLER select MMU_GATHER_NO_RANGE if MMU + select SET_FS config CPU_BIG_ENDIAN def_bool y diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 345727638d52..1b16d97e7da7 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -13,7 +13,7 @@ * DMA mapping callbacks... */ -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/pagewalk.h> #include <asm/cpuinfo.h> diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index c779364f0cd0..af66f968dd45 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -311,7 +311,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) } syscall = 0; } else { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } } diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index cd4afe1e7a6c..b234e8154cbd 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -63,6 +63,7 @@ config PARISC select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE select HAVE_KPROBES_ON_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select SET_FS help The PA-RISC microprocessor is designed by Hewlett-Packard and used @@ -195,7 +196,6 @@ config PA11 depends on PA7000 || PA7100LC || PA7200 || PA7300LC select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select DMA_NONCOHERENT_CACHE_SYNC config PREFETCH def_bool y @@ -376,5 +376,6 @@ config KEXEC_FILE endmenu +source "drivers/firmware/Kconfig" source "drivers/parisc/Kconfig" diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index 61bac8ff8f22..3cbcfad5f724 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -52,10 +52,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_CRYPTOLOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=6144 -CONFIG_IDE=y -CONFIG_BLK_DEV_IDECD=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_NS87415=y CONFIG_BLK_DEV_SD=y CONFIG_CHR_DEV_ST=y CONFIG_BLK_DEV_SR=y @@ -65,6 +61,8 @@ CONFIG_SCSI_SYM53C8XX_2=y CONFIG_SCSI_ZALON=y CONFIG_SCSI_DH=y CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_NS87415=y CONFIG_MD=y CONFIG_BLK_DEV_MD=m CONFIG_MD_LINEAR=m diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 59561e04e659..7e2d7026285e 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -58,11 +58,6 @@ CONFIG_PCI_IOV=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_BLK_DEV_LOOP=y -CONFIG_IDE=y -CONFIG_IDE_GD=m -CONFIG_IDE_GD_ATAPI=y -CONFIG_BLK_DEV_IDECD=m -CONFIG_BLK_DEV_NS87415=y # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y @@ -76,6 +71,7 @@ CONFIG_SCSI_ZALON=y CONFIG_SCSI_QLA_ISCSI=m CONFIG_SCSI_DH=y CONFIG_ATA=y +CONFIG_PATA_NS87415=y CONFIG_PATA_SIL680=y CONFIG_ATA_GENERIC=y CONFIG_MD=y diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h index 640d46edf32e..c705decf2bed 100644 --- a/arch/parisc/include/asm/barrier.h +++ b/arch/parisc/include/asm/barrier.h @@ -2,11 +2,15 @@ #ifndef __ASM_BARRIER_H #define __ASM_BARRIER_H +#include <asm/alternative.h> + #ifndef __ASSEMBLY__ /* The synchronize caches instruction executes as a nop on systems in which all memory references are performed in order. */ -#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory") +#define synchronize_caches() asm volatile("sync" \ + ALTERNATIVE(ALT_COND_NO_SMP, INSN_NOP) \ + : : : "memory") #if defined(CONFIG_SMP) #define mb() do { synchronize_caches(); } while (0) diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h index 068958575871..cf5ee9b0b393 100644 --- a/arch/parisc/include/asm/cmpxchg.h +++ b/arch/parisc/include/asm/cmpxchg.h @@ -14,22 +14,22 @@ extern void __xchg_called_with_bad_pointer(void); /* __xchg32/64 defined in arch/parisc/lib/bitops.c */ -extern unsigned long __xchg8(char, char *); -extern unsigned long __xchg32(int, int *); +extern unsigned long __xchg8(char, volatile char *); +extern unsigned long __xchg32(int, volatile int *); #ifdef CONFIG_64BIT -extern unsigned long __xchg64(unsigned long, unsigned long *); +extern unsigned long __xchg64(unsigned long, volatile unsigned long *); #endif /* optimizer better get rid of switch since size is a constant */ static inline unsigned long -__xchg(unsigned long x, __volatile__ void *ptr, int size) +__xchg(unsigned long x, volatile void *ptr, int size) { switch (size) { #ifdef CONFIG_64BIT - case 8: return __xchg64(x, (unsigned long *) ptr); + case 8: return __xchg64(x, (volatile unsigned long *) ptr); #endif - case 4: return __xchg32((int) x, (int *) ptr); - case 1: return __xchg8((char) x, (char *) ptr); + case 4: return __xchg32((int) x, (volatile int *) ptr); + case 1: return __xchg8((char) x, (volatile char *) ptr); } __xchg_called_with_bad_pointer(); return x; diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index c459f656c8c3..fceb9cf02fb3 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -16,7 +16,7 @@ static inline void _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags) { extern u32 lws_lock_start[]; - long index = ((long)uaddr & 0xf0) >> 2; + long index = ((long)uaddr & 0x3f8) >> 1; arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index]; local_irq_save(*flags); arch_spin_lock(s); @@ -26,7 +26,7 @@ static inline void _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags) { extern u32 lws_lock_start[]; - long index = ((long)uaddr & 0xf0) >> 2; + long index = ((long)uaddr & 0x3f8) >> 1; arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index]; arch_spin_unlock(s); local_irq_restore(*flags); diff --git a/arch/parisc/include/asm/socket.h b/arch/parisc/include/asm/socket.h index 79feff1b0721..33500c9f6e5e 100644 --- a/arch/parisc/include/asm/socket.h +++ b/arch/parisc/include/asm/socket.h @@ -4,8 +4,8 @@ #include <uapi/asm/socket.h> -/* O_NONBLOCK clashes with the bits used for socket types. Therefore we - * have to define SOCK_NONBLOCK to a different value here. +/* O_NONBLOCK clashed with the bits used for socket types. Therefore we + * had to define SOCK_NONBLOCK to a different value here. */ #define SOCK_NONBLOCK 0x40000000 diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 51b6c47f802f..fa5ee8a45dbd 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -10,13 +10,21 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x) { volatile unsigned int *a = __ldcw_align(x); - return *a == 0; + return READ_ONCE(*a) == 0; } -#define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0) +static inline void arch_spin_lock(arch_spinlock_t *x) +{ + volatile unsigned int *a; + + a = __ldcw_align(x); + while (__ldcw(a) == 0) + while (*a == 0) + continue; +} static inline void arch_spin_lock_flags(arch_spinlock_t *x, - unsigned long flags) + unsigned long flags) { volatile unsigned int *a; @@ -25,10 +33,8 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *x, while (*a == 0) if (flags & PSW_SM_I) { local_irq_enable(); - cpu_relax(); local_irq_disable(); - } else - cpu_relax(); + } } #define arch_spin_lock_flags arch_spin_lock_flags @@ -44,12 +50,9 @@ static inline void arch_spin_unlock(arch_spinlock_t *x) static inline int arch_spin_trylock(arch_spinlock_t *x) { volatile unsigned int *a; - int ret; a = __ldcw_align(x); - ret = __ldcw(a) != 0; - - return ret; + return __ldcw(a) != 0; } /* diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h index 03ce20e5ad7d..03dee816cb13 100644 --- a/arch/parisc/include/uapi/asm/fcntl.h +++ b/arch/parisc/include/uapi/asm/fcntl.h @@ -3,22 +3,19 @@ #define _PARISC_FCNTL_H #define O_APPEND 000000010 -#define O_BLKSEEK 000000100 /* HPUX only */ #define O_CREAT 000000400 /* not fcntl */ #define O_EXCL 000002000 /* not fcntl */ #define O_LARGEFILE 000004000 #define __O_SYNC 000100000 #define O_SYNC (__O_SYNC|O_DSYNC) -#define O_NONBLOCK 000200004 /* HPUX has separate NDELAY & NONBLOCK */ +#define O_NONBLOCK 000200000 #define O_NOCTTY 000400000 /* not fcntl */ -#define O_DSYNC 001000000 /* HPUX only */ -#define O_RSYNC 002000000 /* HPUX only */ +#define O_DSYNC 001000000 #define O_NOATIME 004000000 #define O_CLOEXEC 010000000 /* set close_on_exec */ #define O_DIRECTORY 000010000 /* must be a directory */ #define O_NOFOLLOW 000000200 /* don't follow links */ -#define O_INVISIBLE 004000000 /* invisible I/O, for DMAPI/XDSM */ #define O_PATH 020000000 #define __O_TMPFILE 040000000 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 6fd8871e4081..ab78cba446ed 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -25,6 +25,7 @@ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ +#define MAP_UNINITIALIZED 0 /* uninitialized anonymous mmap */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/parisc/include/uapi/asm/signal.h b/arch/parisc/include/uapi/asm/signal.h index d38563a394f2..e605197b462c 100644 --- a/arch/parisc/include/uapi/asm/signal.h +++ b/arch/parisc/include/uapi/asm/signal.h @@ -35,11 +35,11 @@ #define SIGURG 29 #define SIGXFSZ 30 #define SIGUNUSED 31 -#define SIGSYS 31 /* Linux doesn't use this */ +#define SIGSYS 31 /* These should not be considered constants from userland. */ #define SIGRTMIN 32 -#define SIGRTMAX _NSIG /* it's 44 under HP/UX */ +#define SIGRTMAX _NSIG /* * SA_FLAGS values: @@ -61,7 +61,6 @@ #define SA_NODEFER 0x00000020 #define SA_RESTART 0x00000040 #define SA_NOCLDWAIT 0x00000080 -#define _SA_SIGGFAULT 0x00000100 /* HPUX */ #define SA_NOMASK SA_NODEFER #define SA_ONESHOT SA_RESETHAND diff --git a/arch/parisc/install.sh b/arch/parisc/install.sh index 6f68784fea25..056d588befdd 100644 --- a/arch/parisc/install.sh +++ b/arch/parisc/install.sh @@ -43,7 +43,7 @@ fi # Default install -if [ "$(basename $2)" = "zImage" ]; then +if [ "$(basename $2)" = "vmlinuz" ]; then # Compressed install echo "Installing compressed kernel" base=vmlinuz diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index b5e1d9f1b440..86a1a63563fd 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -383,12 +383,12 @@ EXPORT_SYMBOL(flush_kernel_icache_range_asm); static unsigned long parisc_cache_flush_threshold __ro_after_init = FLUSH_THRESHOLD; #define FLUSH_TLB_THRESHOLD (16*1024) /* 16 KiB minimum TLB threshold */ -static unsigned long parisc_tlb_flush_threshold __ro_after_init = FLUSH_TLB_THRESHOLD; +static unsigned long parisc_tlb_flush_threshold __ro_after_init = ~0UL; void __init parisc_setup_cache_timing(void) { unsigned long rangetime, alltime; - unsigned long size, start; + unsigned long size; unsigned long threshold; alltime = mfctl(16); @@ -422,14 +422,9 @@ void __init parisc_setup_cache_timing(void) goto set_tlb_threshold; } - size = 0; - start = (unsigned long) _text; + size = (unsigned long)_end - (unsigned long)_text; rangetime = mfctl(16); - while (start < (unsigned long) _end) { - flush_tlb_kernel_range(start, start + PAGE_SIZE); - start += PAGE_SIZE; - size += PAGE_SIZE; - } + flush_tlb_kernel_range((unsigned long)_text, (unsigned long)_end); rangetime = mfctl(16) - rangetime; alltime = mfctl(16); @@ -444,8 +439,11 @@ void __init parisc_setup_cache_timing(void) threshold/1024); set_tlb_threshold: - if (threshold > parisc_tlb_flush_threshold) + if (threshold > FLUSH_TLB_THRESHOLD) parisc_tlb_flush_threshold = threshold; + else + parisc_tlb_flush_threshold = FLUSH_TLB_THRESHOLD; + printk(KERN_INFO "TLB flush threshold set to %lu KiB\n", parisc_tlb_flush_threshold/1024); } diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index a5f3e50fe976..80fa0650736b 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -30,6 +30,7 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/export.h> +#include <linux/dma-map-ops.h> #include <asm/hardware.h> #include <asm/io.h> #include <asm/pdc.h> diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 519f9056fd00..f6f28e41bb5e 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -899,20 +899,20 @@ intr_check_sig: * Only do signals if we are returning to user space */ LDREG PT_IASQ0(%r16), %r20 - cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* backward */ + cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* forward */ LDREG PT_IASQ1(%r16), %r20 - cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* backward */ - - /* NOTE: We need to enable interrupts if we have to deliver - * signals. We used to do this earlier but it caused kernel - * stack overflows. */ - ssm PSW_SM_I, %r0 + cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* forward */ copy %r0, %r25 /* long in_syscall = 0 */ #ifdef CONFIG_64BIT ldo -16(%r30),%r29 /* Reference param save area */ #endif + /* NOTE: We need to enable interrupts if we have to deliver + * signals. We used to do this earlier but it caused kernel + * stack overflows. */ + ssm PSW_SM_I, %r0 + BL do_notify_resume,%r2 copy %r16, %r26 /* struct pt_regs *regs */ diff --git a/arch/parisc/kernel/inventory.c b/arch/parisc/kernel/inventory.c index 9298f2285510..7ab2f2a54400 100644 --- a/arch/parisc/kernel/inventory.c +++ b/arch/parisc/kernel/inventory.c @@ -19,6 +19,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/platform_device.h> #include <asm/hardware.h> #include <asm/io.h> #include <asm/mmzone.h> @@ -641,4 +642,33 @@ void __init do_device_inventory(void) if (pa_serialize_tlb_flushes) pr_info("Merced bus found: Enable PxTLB serialization.\n"); #endif + +#if defined(CONFIG_FW_CFG_SYSFS) + if (running_on_qemu) { + struct resource res[3] = {0,}; + unsigned int base; + + base = ((unsigned long long) PAGE0->pad0[2] << 32) + | PAGE0->pad0[3]; /* SeaBIOS stored it here */ + + res[0].name = "fw_cfg"; + res[0].start = base; + res[0].end = base + 8 - 1; + res[0].flags = IORESOURCE_MEM; + + res[1].name = "ctrl"; + res[1].start = 0; + res[1].flags = IORESOURCE_REG; + + res[2].name = "data"; + res[2].start = 4; + res[2].flags = IORESOURCE_REG; + + if (base) { + pr_info("Found qemu fw_cfg interface at %#08x\n", base); + platform_device_register_simple("fw_cfg", + PLATFORM_DEVID_NONE, res, 3); + } + } +#endif } diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 38c68e131bbe..36610a5c029f 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -26,7 +26,7 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <asm/cacheflush.h> #include <asm/dma.h> /* for DMA_CHUNK_SIZE */ @@ -454,9 +454,3 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, { flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); } - -void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ - flush_kernel_dcache_range((unsigned long)vaddr, size); -} diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 3c037fc96038..9f43eaeb0b0a 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -606,8 +606,6 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs, in_syscall); - if (test_thread_flag(TIF_NOTIFY_RESUME)) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); - } } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 6271139d2213..10227f667c8a 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -173,9 +173,12 @@ ipi_interrupt(int irq, void *dev_id) this_cpu, which); return IRQ_NONE; } /* Switch */ - /* let in any pending interrupts */ - local_irq_enable(); - local_irq_disable(); + + /* before doing more, let in any pending interrupts */ + if (ops) { + local_irq_enable(); + local_irq_disable(); + } } /* while (ops) */ } return IRQ_HANDLED; diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 3ad61a177f5b..322503780db6 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -571,8 +571,8 @@ lws_compare_and_swap: ldil L%lws_lock_start, %r20 ldo R%lws_lock_start(%r20), %r28 - /* Extract four bits from r26 and hash lock (Bits 4-7) */ - extru %r26, 27, 4, %r20 + /* Extract eight bits from r26 and hash lock (Bits 3-11) */ + extru %r26, 28, 8, %r20 /* Find lock to use, the hash is either one of 0 to 15, multiplied by 16 (keep it 16-byte aligned) @@ -761,8 +761,8 @@ cas2_lock_start: ldil L%lws_lock_start, %r20 ldo R%lws_lock_start(%r20), %r28 - /* Extract four bits from r26 and hash lock (Bits 4-7) */ - extru %r26, 27, 4, %r20 + /* Extract eight bits from r26 and hash lock (Bits 3-11) */ + extru %r26, 28, 8, %r20 /* Find lock to use, the hash is either one of 0 to 15, multiplied by 16 (keep it 16-byte aligned) @@ -950,7 +950,7 @@ END(sys_call_table64) .align L1_CACHE_BYTES ENTRY(lws_lock_start) /* lws locks */ - .rept 16 + .rept 256 /* Keep locks aligned at 16-bytes */ .word 1 .word 0 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index ae3dab371f6f..38c63e5404bc 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c index 2e4d1f05a926..9ac683bf6ae7 100644 --- a/arch/parisc/lib/bitops.c +++ b/arch/parisc/lib/bitops.c @@ -18,7 +18,7 @@ arch_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = { #endif #ifdef CONFIG_64BIT -unsigned long __xchg64(unsigned long x, unsigned long *ptr) +unsigned long __xchg64(unsigned long x, volatile unsigned long *ptr) { unsigned long temp, flags; @@ -30,7 +30,7 @@ unsigned long __xchg64(unsigned long x, unsigned long *ptr) } #endif -unsigned long __xchg32(int x, int *ptr) +unsigned long __xchg32(int x, volatile int *ptr) { unsigned long flags; long temp; @@ -43,7 +43,7 @@ unsigned long __xchg32(int x, int *ptr) } -unsigned long __xchg8(char x, char *ptr) +unsigned long __xchg8(char x, volatile char *ptr) { unsigned long flags; long temp; diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c index ce400417d54e..f03adb1999e7 100644 --- a/arch/parisc/lib/iomap.c +++ b/arch/parisc/lib/iomap.c @@ -346,6 +346,16 @@ u64 ioread64be(const void __iomem *addr) return *((u64 *)addr); } +u64 ioread64_hi_lo(const void __iomem *addr) +{ + u32 low, high; + + high = ioread32(addr + sizeof(u32)); + low = ioread32(addr); + + return low + ((u64)high << 32); +} + void iowrite8(u8 datum, void __iomem *addr) { if (unlikely(INDIRECT_ADDR(addr))) { @@ -409,6 +419,12 @@ void iowrite64be(u64 datum, void __iomem *addr) } } +void iowrite64_hi_lo(u64 val, void __iomem *addr) +{ + iowrite32(val >> 32, addr + sizeof(u32)); + iowrite32(val, addr); +} + /* Repeating interfaces */ void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count) @@ -511,6 +527,7 @@ EXPORT_SYMBOL(ioread32); EXPORT_SYMBOL(ioread32be); EXPORT_SYMBOL(ioread64); EXPORT_SYMBOL(ioread64be); +EXPORT_SYMBOL(ioread64_hi_lo); EXPORT_SYMBOL(iowrite8); EXPORT_SYMBOL(iowrite16); EXPORT_SYMBOL(iowrite16be); @@ -518,6 +535,7 @@ EXPORT_SYMBOL(iowrite32); EXPORT_SYMBOL(iowrite32be); EXPORT_SYMBOL(iowrite64); EXPORT_SYMBOL(iowrite64be); +EXPORT_SYMBOL(iowrite64_hi_lo); EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); EXPORT_SYMBOL(ioread32_rep); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1f0bd7e223f5..e9f13fe08492 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -59,7 +59,10 @@ config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool PPC64 + def_bool y if PPC64 + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y if PPC64 config NR_IRQS int "Number of virtual interrupt numbers" @@ -148,6 +151,7 @@ config PPC select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_TABLE_SORT @@ -964,7 +968,7 @@ config PPC_MEM_KEYS config PPC_SECURE_BOOT prompt "Enable secure boot support" bool - depends on PPC_POWERNV + depends on PPC_POWERNV || PPC_PSERIES depends on IMA_ARCH_POLICY imply IMA_SECURE_AND_OR_TRUSTED_BOOT help @@ -984,6 +988,19 @@ config PPC_SECVAR_SYSFS read/write operations on these variables. Say Y if you have secure boot enabled and want to expose variables to userspace. +config PPC_RTAS_FILTER + bool "Enable filtering of RTAS syscalls" + default y + depends on PPC_RTAS + help + The RTAS syscall API has security issues that could be used to + compromise system integrity. This option enforces restrictions on the + RTAS calls and arguments passed by userspace programs to mitigate + these issues. + + Say Y unless you know what you are doing and the filter is causing + problems for you. + endmenu config ISA_DMA_API diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 3e8da9cf2eb9..a4d56f0a41d9 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -65,7 +65,6 @@ UTS_MACHINE := $(subst $(space),,$(machine-y)) ifdef CONFIG_PPC32 KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o else -KBUILD_LDS_MODULE += $(srctree)/arch/powerpc/kernel/module.lds ifeq ($(call ld-ifversion, -ge, 225000000, y),y) # Have the linker provide sfpr if possible. # There is a corresponding test in arch/powerpc/lib/Makefile @@ -264,7 +263,8 @@ KBUILD_CFLAGS += $(cpu-as-y) KBUILD_AFLAGS += $(aflags-y) KBUILD_CFLAGS += $(cflags-y) -head-y := arch/powerpc/kernel/head_$(BITS).o +head-$(CONFIG_PPC64) := arch/powerpc/kernel/head_64.o +head-$(CONFIG_PPC_BOOK3S_32) := arch/powerpc/kernel/head_book3s_32.o head-$(CONFIG_PPC_8xx) := arch/powerpc/kernel/head_8xx.o head-$(CONFIG_40x) := arch/powerpc/kernel/head_40x.o head-$(CONFIG_44x) := arch/powerpc/kernel/head_44x.o diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index 2268396ff4bb..a6c77f4d32b2 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -18,7 +18,7 @@ quiet_cmd_relocs_check = CHKREL $@ ifdef CONFIG_PPC_BOOK3S_64 cmd_relocs_check = \ $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" ; \ - $(BASH) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh "$(OBJDUMP)" "$@" + $(BASH) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh "$(OBJDUMP)" "$(NM)" "$@" else cmd_relocs_check = \ $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index b88fd27a45f0..f8ce6d2dde7b 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -7,7 +7,7 @@ # Based on coffboot by Paul Mackerras # Simplified for ppc64 by Todd Inglett # -# NOTE: this code is built for 32 bit in ELF32 format even though +# NOTE: this code may be built for 32 bit in ELF32 format even though # it packages a 64 bit kernel. We do this to simplify the # bootloader and increase compatibility with OpenFirmware. # diff --git a/arch/powerpc/boot/dts/fsl/t1024rdb.dts b/arch/powerpc/boot/dts/fsl/t1024rdb.dts index 73a645324bc1..dbcd31cc35dc 100644 --- a/arch/powerpc/boot/dts/fsl/t1024rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1024rdb.dts @@ -161,7 +161,6 @@ rtc@68 { compatible = "dallas,ds1339"; reg = <0x68>; - interrupts = <0x1 0x1 0 0>; }; }; diff --git a/arch/powerpc/boot/dts/fsl/t1040rdb.dts b/arch/powerpc/boot/dts/fsl/t1040rdb.dts index 65ff34c49025..af0c8a6f5613 100644 --- a/arch/powerpc/boot/dts/fsl/t1040rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1040rdb.dts @@ -64,6 +64,40 @@ phy_sgmii_2: ethernet-phy@3 { reg = <0x03>; }; + + /* VSC8514 QSGMII PHY */ + phy_qsgmii_0: ethernet-phy@4 { + reg = <0x4>; + }; + + phy_qsgmii_1: ethernet-phy@5 { + reg = <0x5>; + }; + + phy_qsgmii_2: ethernet-phy@6 { + reg = <0x6>; + }; + + phy_qsgmii_3: ethernet-phy@7 { + reg = <0x7>; + }; + + /* VSC8514 QSGMII PHY */ + phy_qsgmii_4: ethernet-phy@8 { + reg = <0x8>; + }; + + phy_qsgmii_5: ethernet-phy@9 { + reg = <0x9>; + }; + + phy_qsgmii_6: ethernet-phy@a { + reg = <0xa>; + }; + + phy_qsgmii_7: ethernet-phy@b { + reg = <0xb>; + }; }; }; }; @@ -76,3 +110,76 @@ }; #include "t1040si-post.dtsi" + +&seville_switch { + status = "okay"; +}; + +&seville_port0 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_0>; + phy-mode = "qsgmii"; + label = "ETH5"; + status = "okay"; +}; + +&seville_port1 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_1>; + phy-mode = "qsgmii"; + label = "ETH4"; + status = "okay"; +}; + +&seville_port2 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_2>; + phy-mode = "qsgmii"; + label = "ETH7"; + status = "okay"; +}; + +&seville_port3 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_3>; + phy-mode = "qsgmii"; + label = "ETH6"; + status = "okay"; +}; + +&seville_port4 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_4>; + phy-mode = "qsgmii"; + label = "ETH9"; + status = "okay"; +}; + +&seville_port5 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_5>; + phy-mode = "qsgmii"; + label = "ETH8"; + status = "okay"; +}; + +&seville_port6 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_6>; + phy-mode = "qsgmii"; + label = "ETH11"; + status = "okay"; +}; + +&seville_port7 { + managed = "in-band-status"; + phy-handle = <&phy_qsgmii_7>; + phy-mode = "qsgmii"; + label = "ETH10"; + status = "okay"; +}; + +&seville_port8 { + ethernet = <&enet0>; + status = "okay"; +}; diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index 315d0557eefc..f58eb820eb5e 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -628,6 +628,84 @@ status = "disabled"; }; }; + + seville_switch: ethernet-switch@800000 { + compatible = "mscc,vsc9953-switch"; + reg = <0x800000 0x290000>; + interrupts = <26 2 0 0>; + interrupt-names = "xtr"; + little-endian; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + seville_port0: port@0 { + reg = <0>; + status = "disabled"; + }; + + seville_port1: port@1 { + reg = <1>; + status = "disabled"; + }; + + seville_port2: port@2 { + reg = <2>; + status = "disabled"; + }; + + seville_port3: port@3 { + reg = <3>; + status = "disabled"; + }; + + seville_port4: port@4 { + reg = <4>; + status = "disabled"; + }; + + seville_port5: port@5 { + reg = <5>; + status = "disabled"; + }; + + seville_port6: port@6 { + reg = <6>; + status = "disabled"; + }; + + seville_port7: port@7 { + reg = <7>; + status = "disabled"; + }; + + seville_port8: port@8 { + reg = <8>; + phy-mode = "internal"; + status = "disabled"; + + fixed-link { + speed = <2500>; + full-duplex; + }; + }; + + seville_port9: port@9 { + reg = <9>; + phy-mode = "internal"; + status = "disabled"; + + fixed-link { + speed = <2500>; + full-duplex; + }; + }; + }; + }; }; &qe { diff --git a/arch/powerpc/boot/dts/fsl/t4240rdb.dts b/arch/powerpc/boot/dts/fsl/t4240rdb.dts index a56a705d41f7..145896f2eef6 100644 --- a/arch/powerpc/boot/dts/fsl/t4240rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t4240rdb.dts @@ -144,7 +144,6 @@ rtc@68 { compatible = "dallas,ds1374"; reg = <0x68>; - interrupts = <0x1 0x1 0 0>; }; }; diff --git a/arch/powerpc/boot/util.S b/arch/powerpc/boot/util.S index f11f0589a669..d03cdb7606dc 100644 --- a/arch/powerpc/boot/util.S +++ b/arch/powerpc/boot/util.S @@ -18,7 +18,7 @@ .text -/* udelay (on non-601 processors) needs to know the period of the +/* udelay needs to know the period of the * timebase in nanoseconds. This used to be hardcoded to be 60ns * (period of 66MHz/4). Now a variable is used that is initialized to * 60 for backward compatibility, but it can be overridden as necessary @@ -37,19 +37,6 @@ timebase_period_ns: */ .globl udelay udelay: - mfspr r4,SPRN_PVR - srwi r4,r4,16 - cmpwi 0,r4,1 /* 601 ? */ - bne .Ludelay_not_601 -00: li r0,86 /* Instructions / microsecond? */ - mtctr r0 -10: addi r0,r0,0 /* NOP */ - bdnz 10b - subic. r3,r3,1 - bne 00b - blr - -.Ludelay_not_601: mulli r4,r3,1000 /* nanoseconds */ /* Change r4 to be the number of ticks using: * (nanoseconds + (timebase_period_ns - 1 )) / timebase_period_ns diff --git a/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig b/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig index 0683d8c292a8..cea72e85ed26 100644 --- a/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig +++ b/arch/powerpc/configs/85xx/mpc85xx_cds_defconfig @@ -29,9 +29,9 @@ CONFIG_SYN_COOKIES=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -CONFIG_IDE=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_VIA=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E1000=y diff --git a/arch/powerpc/configs/85xx/tqm8540_defconfig b/arch/powerpc/configs/85xx/tqm8540_defconfig index 98982a0e82d8..bbf040aa1f9a 100644 --- a/arch/powerpc/configs/85xx/tqm8540_defconfig +++ b/arch/powerpc/configs/85xx/tqm8540_defconfig @@ -30,9 +30,9 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -CONFIG_IDE=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_VIA=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y diff --git a/arch/powerpc/configs/85xx/tqm8541_defconfig b/arch/powerpc/configs/85xx/tqm8541_defconfig index a6e21db1dafe..523ad8dcfd9d 100644 --- a/arch/powerpc/configs/85xx/tqm8541_defconfig +++ b/arch/powerpc/configs/85xx/tqm8541_defconfig @@ -30,9 +30,9 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -CONFIG_IDE=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_VIA=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y diff --git a/arch/powerpc/configs/85xx/tqm8555_defconfig b/arch/powerpc/configs/85xx/tqm8555_defconfig index ca1de3979474..0032ce1e8c9c 100644 --- a/arch/powerpc/configs/85xx/tqm8555_defconfig +++ b/arch/powerpc/configs/85xx/tqm8555_defconfig @@ -30,9 +30,9 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -CONFIG_IDE=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_VIA=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y diff --git a/arch/powerpc/configs/85xx/tqm8560_defconfig b/arch/powerpc/configs/85xx/tqm8560_defconfig index ca3b8c8ef30f..a80b971f7d6e 100644 --- a/arch/powerpc/configs/85xx/tqm8560_defconfig +++ b/arch/powerpc/configs/85xx/tqm8560_defconfig @@ -30,9 +30,9 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -CONFIG_IDE=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_VIA=y CONFIG_NETDEVICES=y CONFIG_GIANFAR=y CONFIG_E100=y diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index de14b1a34d56..d0b832cbbec8 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -67,6 +67,7 @@ void single_step_exception(struct pt_regs *regs); void program_check_exception(struct pt_regs *regs); void alignment_exception(struct pt_regs *regs); void StackOverflow(struct pt_regs *regs); +void stack_overflow_exception(struct pt_regs *regs); void kernel_fp_unavailable_exception(struct pt_regs *regs); void altivec_unavailable_exception(struct pt_regs *regs); void vsx_unavailable_exception(struct pt_regs *regs); @@ -144,7 +145,9 @@ void _kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr); void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr); /* Patch sites */ -extern s32 patch__call_flush_branch_caches; +extern s32 patch__call_flush_branch_caches1; +extern s32 patch__call_flush_branch_caches2; +extern s32 patch__call_flush_branch_caches3; extern s32 patch__flush_count_cache_return; extern s32 patch__flush_link_stack_return; extern s32 patch__call_kvm_flush_link_stack; diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 082b98808701..b6ac4f86c87b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -13,20 +13,24 @@ */ #define MAX_EA_BITS_PER_CONTEXT 46 -#define REGION_SHIFT (MAX_EA_BITS_PER_CONTEXT - 2) /* - * Our page table limit us to 64TB. Hence for the kernel mapping, - * each MAP area is limited to 16 TB. - * The four map areas are: linear mapping, vmap, IO and vmemmap + * Our page table limit us to 64TB. For 64TB physical memory, we only need 64GB + * of vmemmap space. To better support sparse memory layout, we use 61TB + * linear map range, 1TB of vmalloc, 1TB of I/O and 1TB of vmememmap. */ +#define REGION_SHIFT (40) #define H_KERN_MAP_SIZE (ASM_CONST(1) << REGION_SHIFT) /* - * Define the address range of the kernel non-linear virtual area - * 16TB + * Limits the linear mapping range */ -#define H_KERN_VIRT_START ASM_CONST(0xc000100000000000) +#define H_MAX_PHYSMEM_BITS 46 + +/* + * Define the address range of the kernel non-linear virtual area (61TB) + */ +#define H_KERN_VIRT_START ASM_CONST(0xc0003d0000000000) #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index f20de1149ebe..338e62fbea0b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -7,6 +7,19 @@ #define H_PUD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16GB = 16TB #define H_PGD_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 16TB = 4PB +/* + * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS + * if we increase SECTIONS_WIDTH we will not store node details in page->flags and + * page_to_nid does a page->section->node lookup + * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce + * memory requirements with large number of sections. + * 51 bits is the max physical real address on POWER9 + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) +#define H_MAX_PHYSMEM_BITS 51 +#else +#define H_MAX_PHYSMEM_BITS 46 +#endif /* * Each context is 512TB size. SLB miss for first context/default context diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 93d18da5e7ec..683a9c7d1b03 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -577,8 +577,8 @@ extern void slb_set_size(u16 size); * For vmalloc and memmap, we use just one context with 512TB. With 64 byte * struct page size, we need ony 32 TB in memmap for 2PB (51 bits (MAX_PHYSMEM_BITS)). */ -#if (MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT) -#define MAX_KERNEL_CTX_CNT (1UL << (MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT)) +#if (H_MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT) +#define MAX_KERNEL_CTX_CNT (1UL << (H_MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT)) #else #define MAX_KERNEL_CTX_CNT 1 #endif diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index b392384a3b15..e0b52940e43c 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -27,21 +27,6 @@ struct mmu_psize_def { extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; #endif /* __ASSEMBLY__ */ -/* - * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS - * if we increase SECTIONS_WIDTH we will not store node details in page->flags and - * page_to_nid does a page->section->node lookup - * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce - * memory requirements with large number of sections. - * 51 bits is the max physical real address on POWER9 - */ -#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ - defined(CONFIG_PPC_64K_PAGES) -#define MAX_PHYSMEM_BITS 51 -#else -#define MAX_PHYSMEM_BITS 46 -#endif - /* 64-bit classic hash table MMU */ #include <asm/book3s/64/mmu-hash.h> @@ -85,7 +70,7 @@ extern unsigned int mmu_base_pid; /* * memory block size used with radix translation. */ -extern unsigned int __ro_after_init radix_mem_block_size; +extern unsigned long __ro_after_init radix_mem_block_size; #define PRTB_SIZE_SHIFT (mmu_pid_bits + 4) #define PRTB_ENTRIES (1ul << mmu_pid_bits) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 495fc0ccb453..cd3feeac6e87 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -294,6 +294,13 @@ extern unsigned long pci_io_base; #include <asm/book3s/64/hash.h> #include <asm/book3s/64/radix.h> +#if H_MAX_PHYSMEM_BITS > R_MAX_PHYSMEM_BITS +#define MAX_PHYSMEM_BITS H_MAX_PHYSMEM_BITS +#else +#define MAX_PHYSMEM_BITS R_MAX_PHYSMEM_BITS +#endif + + #ifdef CONFIG_PPC_64K_PAGES #include <asm/book3s/64/pgtable-64k.h> #else @@ -615,7 +622,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) VM_BUG_ON(pfn >> (64 - PAGE_SHIFT)); VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK); - return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot)); + return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE); } static inline unsigned long pte_pfn(pte_t pte) @@ -651,11 +658,6 @@ static inline pte_t pte_mkexec(pte_t pte) return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC)); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); -} - static inline pte_t pte_mkwrite(pte_t pte) { /* @@ -819,6 +821,14 @@ static inline int pte_none(pte_t pte) static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, int percpu) { + + VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE))); + /* + * Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE + * in all the callers. + */ + pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); + if (radix_enabled()) return radix__set_pte_at(mm, addr, ptep, pte, percpu); return hash__set_pte_at(mm, addr, ptep, pte, percpu); @@ -866,6 +876,13 @@ static inline bool pte_ci(pte_t pte) static inline void pmd_clear(pmd_t *pmdp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* + * Don't use this if we can possibly have a hash page table + * entry mapping this. + */ + WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pmdp = __pmd(0); } @@ -914,6 +931,13 @@ static inline int pmd_bad(pmd_t pmd) static inline void pud_clear(pud_t *pudp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* + * Don't use this if we can possibly have a hash page table + * entry mapping this. + */ + WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pudp = __pud(0); } diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 0cba794c4fb8..c7813dc628fc 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -91,6 +91,22 @@ * +------------------------------+ Kernel linear (0xc.....) */ + +/* + * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS + * if we increase SECTIONS_WIDTH we will not store node details in page->flags and + * page_to_nid does a page->section->node lookup + * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce + * memory requirements with large number of sections. + * 51 bits is the max physical real address on POWER9 + */ + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) +#define R_MAX_PHYSMEM_BITS 51 +#else +#define R_MAX_PHYSMEM_BITS 46 +#endif + #define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) /* * 49 = MAX_EA_BITS_PER_CONTEXT (hash specific). To make sure we pick diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index 54764c6e922d..138e46d8c04e 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -98,6 +98,16 @@ static inline void invalidate_dcache_range(unsigned long start, mb(); /* sync */ } +#ifdef CONFIG_4xx +static inline void flush_instruction_cache(void) +{ + iccci((void *)KERNELBASE); + isync(); +} +#else +void flush_instruction_cache(void); +#endif + #include <asm-generic/cacheflush.h> #endif /* _ASM_POWERPC_CACHEFLUSH_H */ diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 32a15dc49e8c..93bc70d4c9a1 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -9,11 +9,6 @@ #ifndef __ASSEMBLY__ -/* - * Added to include __machine_check_early_realmode_* functions - */ -#include <asm/mce.h> - /* This structure can grow, it's real size is used by head.S code * via the mkdefs mechanism. */ @@ -170,6 +165,7 @@ static inline void cpu_feature_keys_init(void) { } #else /* CONFIG_PPC32 */ /* Define these to 0 for the sake of tests in common code */ #define CPU_FTR_PPC_LE (0) +#define CPU_FTR_SPE (0) #endif /* @@ -299,8 +295,6 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_MAYBE_CAN_NAP 0 #endif -#define CPU_FTRS_PPC601 (CPU_FTR_COMMON | \ - CPU_FTR_COHERENT_ICACHE) #define CPU_FTRS_603 (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \ CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE) #define CPU_FTRS_604 (CPU_FTR_COMMON | CPU_FTR_PPC_LE) @@ -516,10 +510,8 @@ static inline void cpu_feature_keys_init(void) { } #else enum { CPU_FTRS_POSSIBLE = -#ifdef CONFIG_PPC_BOOK3S_601 - CPU_FTRS_PPC601 | -#elif defined(CONFIG_PPC_BOOK3S_32) - CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU | +#ifdef CONFIG_PPC_BOOK3S_32 + CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU | CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 | CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX | CPU_FTRS_7400_NOTAU | CPU_FTRS_7400 | CPU_FTRS_7450_20 | @@ -594,9 +586,7 @@ enum { #else enum { CPU_FTRS_ALWAYS = -#ifdef CONFIG_PPC_BOOK3S_601 - CPU_FTRS_PPC601 & -#elif defined(CONFIG_PPC_BOOK3S_32) +#ifdef CONFIG_PPC_BOOK3S_32 CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU & CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 & CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX & diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index deb99fd6e060..98c8bd155bf9 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -23,7 +23,6 @@ extern int threads_per_core; extern int threads_per_subcore; extern int threads_shift; -extern bool has_big_cores; extern cpumask_t threads_core_mask; #else #define threads_per_core 1 diff --git a/arch/powerpc/include/asm/delay.h b/arch/powerpc/include/asm/delay.h index 66963f7d3e64..51bb8c1476c7 100644 --- a/arch/powerpc/include/asm/delay.h +++ b/arch/powerpc/include/asm/delay.h @@ -54,7 +54,7 @@ extern void udelay(unsigned long usecs); ({ \ typeof(condition) __ret; \ unsigned long __loops = tb_ticks_per_usec * timeout; \ - unsigned long __start = get_tbl(); \ + unsigned long __start = mftb(); \ \ if (delay) { \ while (!(__ret = (condition)) && \ diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h index abc154d784b0..128304cbee1d 100644 --- a/arch/powerpc/include/asm/dma-direct.h +++ b/arch/powerpc/include/asm/dma-direct.h @@ -2,12 +2,12 @@ #ifndef ASM_POWERPC_DMA_DIRECT_H #define ASM_POWERPC_DMA_DIRECT_H 1 -static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return paddr + dev->archdata.dma_offset; } -static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { return daddr - dev->archdata.dma_offset; } diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 17ccc6474ab6..bf2402fed3e0 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -8,26 +8,39 @@ #ifndef _ASM_POWERPC_LMB_H #define _ASM_POWERPC_LMB_H +#include <linux/sched.h> + struct drmem_lmb { u64 base_addr; u32 drc_index; u32 aa_index; u32 flags; -#ifdef CONFIG_MEMORY_HOTPLUG - int nid; -#endif }; struct drmem_lmb_info { struct drmem_lmb *lmbs; int n_lmbs; - u32 lmb_size; + u64 lmb_size; }; extern struct drmem_lmb_info *drmem_info; +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb, + const struct drmem_lmb *start) +{ + /* + * DLPAR code paths can take several milliseconds per element + * when interacting with firmware. Ensure that we don't + * unfairly monopolize the CPU. + */ + if (((++lmb - start) % 16) == 0) + cond_resched(); + + return lmb; +} + #define for_each_drmem_lmb_in_range(lmb, start, end) \ - for ((lmb) = (start); (lmb) < (end); (lmb)++) + for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start)) #define for_each_drmem_lmb(lmb) \ for_each_drmem_lmb_in_range((lmb), \ @@ -67,7 +80,7 @@ struct of_drconf_cell_v2 { #define DRCONF_MEM_RESERVED 0x00000080 #define DRCONF_MEM_HOTREMOVABLE 0x00000100 -static inline u32 drmem_lmb_size(void) +static inline u64 drmem_lmb_size(void) { return drmem_info->lmb_size; } @@ -105,22 +118,4 @@ static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) lmb->aa_index = 0xffffffff; } -#ifdef CONFIG_MEMORY_HOTPLUG -static inline void lmb_set_nid(struct drmem_lmb *lmb) -{ - lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr); -} -static inline void lmb_clear_nid(struct drmem_lmb *lmb) -{ - lmb->nid = -1; -} -#else -static inline void lmb_set_nid(struct drmem_lmb *lmb) -{ -} -static inline void lmb_clear_nid(struct drmem_lmb *lmb) -{ -} -#endif - #endif /* _ASM_POWERPC_LMB_H */ diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index d5f369bcd130..b1a5bba2e0b9 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -27,7 +27,6 @@ struct pci_dn; #define EEH_FORCE_DISABLED 0x02 /* EEH disabled */ #define EEH_PROBE_MODE_DEV 0x04 /* From PCI device */ #define EEH_PROBE_MODE_DEVTREE 0x08 /* From device tree */ -#define EEH_VALID_PE_ZERO 0x10 /* PE#0 is valid */ #define EEH_ENABLE_IO_FOR_LOG 0x20 /* Enable IO for log */ #define EEH_EARLY_DUMP_LOG 0x40 /* Dump log immediately */ @@ -74,7 +73,6 @@ struct pci_dn; struct eeh_pe { int type; /* PE type: PHB/Bus/Device */ int state; /* PE EEH dependent mode */ - int config_addr; /* Traditional PCI address */ int addr; /* PE configuration address */ struct pci_controller *phb; /* Associated PHB */ struct pci_bus *bus; /* Top PCI bus for bus PE */ @@ -216,7 +214,6 @@ enum { struct eeh_ops { char *name; - int (*init)(void); struct eeh_dev *(*probe)(struct pci_dev *pdev); int (*set_option)(struct eeh_pe *pe, int option); int (*get_state)(struct eeh_pe *pe, int *delay); @@ -281,8 +278,7 @@ int eeh_phb_pe_create(struct pci_controller *phb); int eeh_wait_state(struct eeh_pe *pe, int max_wait); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); -struct eeh_pe *eeh_pe_get(struct pci_controller *phb, - int pe_no, int config_addr); +struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no); int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent); int eeh_pe_tree_remove(struct eeh_dev *edev); void eeh_pe_update_time_stamp(struct eeh_pe *pe); @@ -295,8 +291,7 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe); struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); void eeh_show_enabled(void); -int __init eeh_ops_register(struct eeh_ops *ops); -int __exit eeh_ops_unregister(const char *name); +int __init eeh_init(struct eeh_ops *ops); int eeh_check_failure(const volatile void __iomem *token); int eeh_dev_check_failure(struct eeh_dev *edev); void eeh_addr_cache_init(void); diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index fbb377055471..c1fbccb04390 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -375,11 +375,13 @@ #define H_CPU_CHAR_THREAD_RECONFIG_CTRL (1ull << 57) // IBM bit 6 #define H_CPU_CHAR_COUNT_CACHE_DISABLED (1ull << 56) // IBM bit 7 #define H_CPU_CHAR_BCCTR_FLUSH_ASSIST (1ull << 54) // IBM bit 9 +#define H_CPU_CHAR_BCCTR_LINK_FLUSH_ASSIST (1ull << 52) // IBM bit 11 #define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0 #define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1 #define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 #define H_CPU_BEHAV_FLUSH_COUNT_CACHE (1ull << 58) // IBM bit 5 +#define H_CPU_BEHAV_FLUSH_LINK_STACK (1ull << 57) // IBM bit 6 /* Flag values used in H_REGISTER_PROC_TBL hcall */ #define PROC_TABLE_OP_MASK 0x18 @@ -560,6 +562,42 @@ struct hv_guest_state { /* Latest version of hv_guest_state structure */ #define HV_GUEST_STATE_VERSION 1 +/* + * From the document "H_GetPerformanceCounterInfo Interface" v1.07 + * + * H_GET_PERF_COUNTER_INFO argument + */ +struct hv_get_perf_counter_info_params { + __be32 counter_request; /* I */ + __be32 starting_index; /* IO */ + __be16 secondary_index; /* IO */ + __be16 returned_values; /* O */ + __be32 detail_rc; /* O, only needed when called via *_norets() */ + + /* + * O, size each of counter_value element in bytes, only set for version + * >= 0x3 + */ + __be16 cv_element_size; + + /* I, 0 (zero) for versions < 0x3 */ + __u8 counter_info_version_in; + + /* O, 0 (zero) if version < 0x3. Must be set to 0 when making hcall */ + __u8 counter_info_version_out; + __u8 reserved[0xC]; + __u8 counter_value[]; +} __packed; + +#define HGPCI_REQ_BUFFER_SIZE 4096 +#define HGPCI_MAX_DATA_BYTES \ + (HGPCI_REQ_BUFFER_SIZE - sizeof(struct hv_get_perf_counter_info_params)) + +struct hv_gpci_request_buffer { + struct hv_get_perf_counter_info_params params; + uint8_t bytes[HGPCI_MAX_DATA_BYTES]; +} __packed; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index db206a7f38e2..abebfbee5b1c 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -10,6 +10,7 @@ #define _PPC_BOOK3S_64_HW_BREAKPOINT_H #include <asm/cpu_has_feature.h> +#include <asm/inst.h> #ifdef __KERNEL__ struct arch_hw_breakpoint { @@ -17,6 +18,7 @@ struct arch_hw_breakpoint { u16 type; u16 len; /* length of the target data symbol */ u16 hw_len; /* length programmed in hw */ + u8 flags; }; /* Note: Don't change the first 6 bits below as they are in the same order @@ -36,12 +38,15 @@ struct arch_hw_breakpoint { #define HW_BRK_TYPE_PRIV_ALL (HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL | \ HW_BRK_TYPE_HYP) +#define HW_BRK_FLAG_DISABLED 0x1 + /* Minimum granularity */ #ifdef CONFIG_PPC_8xx #define HW_BREAKPOINT_SIZE 0x4 #else #define HW_BREAKPOINT_SIZE 0x8 #endif +#define HW_BREAKPOINT_SIZE_QUADWORD 0x10 #define DABR_MAX_LEN 8 #define DAWR_MAX_LEN 512 @@ -51,6 +56,13 @@ static inline int nr_wp_slots(void) return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1; } +bool wp_check_constraints(struct pt_regs *regs, struct ppc_inst instr, + unsigned long ea, int type, int size, + struct arch_hw_breakpoint *info); + +void wp_get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, + int *type, int *size, unsigned long *ea); + #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <linux/kdebug.h> #include <asm/reg.h> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 35060be09073..0363734ff56e 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -25,9 +25,8 @@ #define PACA_IRQ_DBELL 0x02 #define PACA_IRQ_EE 0x04 #define PACA_IRQ_DEC 0x08 /* Or FIT */ -#define PACA_IRQ_EE_EDGE 0x10 /* BookE only */ -#define PACA_IRQ_HMI 0x20 -#define PACA_IRQ_PMI 0x40 +#define PACA_IRQ_HMI 0x10 +#define PACA_IRQ_PMI 0x20 /* * Some soft-masked interrupts must be hard masked until they are replayed @@ -369,12 +368,6 @@ static inline void may_hard_irq_enable(void) { } #define ARCH_IRQ_INIT_FLAGS IRQ_NOREQUEST -/* - * interrupt-retrigger: should we handle this via lost interrupts and IPIs - * or should we not care like we do now ? --BenH. - */ -struct irq_chip; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/include/asm/icswx.h index b0c70a35fd0e..f6599ccb3012 100644 --- a/arch/powerpc/include/asm/icswx.h +++ b/arch/powerpc/include/asm/icswx.h @@ -156,8 +156,7 @@ struct coprocessor_request_block { u8 reserved[32]; struct coprocessor_status_block csb; -} __packed; - +} __aligned(128); /* RFC02167 Initiate Coprocessor Instructions document * Chapter 8.2.1.1.1 RS @@ -188,6 +187,9 @@ static inline int icswx(__be32 ccw, struct coprocessor_request_block *crb) __be64 ccw_reg = ccw; u32 cr; + /* NB: the same structures are used by VAS-NX */ + BUILD_BUG_ON(sizeof(*crb) != 128); + __asm__ __volatile__( PPC_ICSWX(%1,0,%2) "\n" "mfcr %0\n" diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 5032f1593299..deef7c94d7b6 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -12,7 +12,7 @@ #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/device.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/bitops.h> #include <asm/machdep.h> #include <asm/types.h> diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 814dfab7e392..4f983ca4030a 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -35,7 +35,6 @@ static __inline__ int irq_canonicalize(int irq) extern int distribute_irqs; -struct irqaction; struct pt_regs; #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 10ded83414de..d67a470e95a3 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -326,6 +326,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_xics *xics_device; struct kvmppc_xive *xive; /* Current XIVE device in use */ struct { struct kvmppc_xive *native; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index a90b892f0bfe..95081078aa8a 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,7 +65,6 @@ struct machdep_calls { void __noreturn (*restart)(char *cmd); void __noreturn (*halt)(void); void (*panic)(char *str); - void (*cpu_die)(void); long (*time_init)(void); /* Optional, may be NULL */ @@ -222,8 +221,6 @@ struct machdep_calls { extern void e500_idle(void); extern void power4_idle(void); -extern void power7_idle(void); -extern void power9_idle(void); extern void ppc6xx_idle(void); extern void book3e_idle(void); diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 7f3658a97384..e02aa793420b 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -244,7 +244,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { - switch_mm(prev, next, current); + switch_mm_irqs_off(prev, next, current); } /* We don't currently use enter_lazy_tlb() for anything */ diff --git a/arch/powerpc/kernel/module.lds b/arch/powerpc/include/asm/module.lds.h index cea5dc124be4..cea5dc124be4 100644 --- a/arch/powerpc/kernel/module.lds +++ b/arch/powerpc/include/asm/module.lds.h diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index e752a5807a59..39be9aea86db 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -65,4 +65,18 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep, clr, set, 1); } +#ifdef CONFIG_PPC_4K_PAGES +static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable) +{ + size_t size = huge_page_size(hstate_vma(vma)); + + if (size == SZ_16K) + return __pte(pte_val(entry) & ~_PAGE_HUGE); + else + return entry; +} +#define arch_make_huge_pte arch_make_huge_pte +#endif + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index b9e134d0f03a..ee2243ba96cf 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -227,6 +227,19 @@ static inline void pmd_clear(pmd_t *pmdp) */ #ifdef CONFIG_PPC_8xx static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr); +static int hugepd_ok(hugepd_t hpd); + +static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge) +{ + if (!huge) + return PAGE_SIZE / SZ_4K; + else if (hugepd_ok(*((hugepd_t *)pmd))) + return 1; + else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE)) + return SZ_16K / SZ_4K; + else + return SZ_512K / SZ_4K; +} static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, unsigned long clr, unsigned long set, int huge) @@ -237,12 +250,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p int num, i; pmd_t *pmd = pmd_off(mm, addr); - if (!huge) - num = PAGE_SIZE / SZ_4K; - else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) != _PMD_PAGE_8M) - num = SZ_512K / SZ_4K; - else - num = 1; + num = number_of_cells_per_pte(pmd, new, huge); for (i = 0; i < num; i++, entry++, new += SZ_4K) *entry = new; diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 4b7c3472eab1..6277e7596ae5 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -140,11 +140,6 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return pte; -} - static inline pte_t pte_mkspecial(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPECIAL); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 63ed7e3b0ba3..6436f0b41539 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -9,7 +9,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/string.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/scatterlist.h> #include <asm/machdep.h> diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h index ee79d2cd9fb6..d37ededca3ee 100644 --- a/arch/powerpc/include/asm/pnv-ocxl.h +++ b/arch/powerpc/include/asm/pnv-ocxl.h @@ -28,7 +28,4 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p void pnv_ocxl_spa_release(void *platform_data); int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle); -int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr); -void pnv_ocxl_free_xive_irq(u32 irq); - #endif /* _ASM_PNV_OCXL_H */ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index b4cc6608131c..511786f0e40d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -382,16 +382,6 @@ n: #endif /* various errata or part fixups */ -#ifdef CONFIG_PPC601_SYNC_FIX -#define SYNC sync; isync -#define SYNC_601 sync -#define ISYNC_601 isync -#else -#define SYNC -#define SYNC_601 -#define ISYNC_601 -#endif - #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define MFTB(dest) \ 90: mfspr dest, SPRN_TBRL; \ @@ -411,8 +401,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #define MFTBU(dest) mfspr dest, SPRN_TBRU #endif -/* tlbsync is not implemented on 601 */ -#if !defined(CONFIG_SMP) || defined(CONFIG_PPC_BOOK3S_601) +#ifndef CONFIG_SMP #define TLBSYNC #else #define TLBSYNC tlbsync; sync diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index ed0d633ab5aa..c61c859b51a8 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -83,10 +83,6 @@ struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); void release_thread(struct task_struct *); -typedef struct { - unsigned long seg; -} mm_segment_t; - #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET] #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET] @@ -148,7 +144,6 @@ struct thread_struct { unsigned long ksp_vsid; #endif struct pt_regs *regs; /* Pointer to saved register state */ - mm_segment_t addr_limit; /* for get_fs() validation */ #ifdef CONFIG_BOOKE /* BookE base exception scratch space; align on cacheline */ unsigned long normsave[8] ____cacheline_aligned; @@ -220,6 +215,7 @@ struct thread_struct { unsigned long tm_tar; unsigned long tm_ppr; unsigned long tm_dscr; + unsigned long tm_amr; /* * Checkpointed FP and VSX 0-31 register set. @@ -295,7 +291,6 @@ struct thread_struct { #define INIT_THREAD { \ .ksp = INIT_SP, \ .ksp_limit = INIT_SP_LIMIT, \ - .addr_limit = KERNEL_DS, \ .pgdir = swapper_pg_dir, \ .fpexc_mode = MSR_FE0 | MSR_FE1, \ SPEFSCR_INIT \ @@ -303,7 +298,6 @@ struct thread_struct { #else #define INIT_THREAD { \ .ksp = INIT_SP, \ - .addr_limit = KERNEL_DS, \ .fpexc_mode = 0, \ } #endif @@ -432,16 +426,10 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); -extern void power9_idle_type(unsigned long stop_psscr_val, +extern void arch300_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask); -extern void flush_instruction_cache(void); -extern void hard_reset_now(void); -extern void poweroff_now(void); extern int fix_alignment(struct pt_regs *); -extern void cvt_fd(float *from, double *to); -extern void cvt_df(double *from, float *to); -extern void _nmask_and_or_msr(unsigned long nmask, unsigned long or_val); #ifdef CONFIG_PPC64 /* diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 155a197c0aa1..e2c778c176a3 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -243,11 +243,7 @@ static inline void set_trap_norestart(struct pt_regs *regs) } #define arch_has_single_step() (1) -#ifndef CONFIG_PPC_BOOK3S_601 #define arch_has_block_step() (true) -#else -#define arch_has_block_step() (false) -#endif #define ARCH_HAS_USER_SINGLE_STEP_REPORT /* diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 88fb88491fe9..f877a576b338 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -521,6 +521,8 @@ #define SPRN_TSCR 0x399 /* Thread Switch Control Register */ #define SPRN_DEC 0x016 /* Decrement Register */ +#define SPRN_PIT 0x3DB /* Programmable Interval Timer (40x/BOOKE) */ + #define SPRN_DER 0x095 /* Debug Enable Register */ #define DER_RSTE 0x40000000 /* Reset Interrupt */ #define DER_CHSTPE 0x20000000 /* Check Stop */ @@ -817,7 +819,7 @@ #define THRM1_TIN (1 << 31) #define THRM1_TIV (1 << 30) #define THRM1_THRES(x) ((x&0x7f)<<23) -#define THRM3_SITV(x) ((x&0x3fff)<<1) +#define THRM3_SITV(x) ((x & 0x1fff) << 1) #define THRM1_TID (1<<2) #define THRM1_TIE (1<<1) #define THRM1_V (1<<0) @@ -1353,6 +1355,7 @@ #define PVR_POWER8NVL 0x004C #define PVR_POWER8 0x004D #define PVR_POWER9 0x004E +#define PVR_POWER10 0x0080 #define PVR_BE 0x0070 #define PVR_PA6T 0x0090 @@ -1416,8 +1419,7 @@ static inline void msr_check_and_clear(unsigned long bits) __msr_check_and_clear(bits); } -#ifdef __powerpc64__ -#if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_CELL) || defined(CONFIG_E500) #define mftb() ({unsigned long rval; \ asm volatile( \ "90: mfspr %0, %2;\n" \ @@ -1427,29 +1429,23 @@ static inline void msr_check_and_clear(unsigned long bits) : "=r" (rval) \ : "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL) : "cr0"); \ rval;}) +#elif defined(CONFIG_PPC_8xx) +#define mftb() ({unsigned long rval; \ + asm volatile("mftbl %0" : "=r" (rval)); rval;}) #else #define mftb() ({unsigned long rval; \ asm volatile("mfspr %0, %1" : \ "=r" (rval) : "i" (SPRN_TBRL)); rval;}) #endif /* !CONFIG_PPC_CELL */ -#else /* __powerpc64__ */ - #if defined(CONFIG_PPC_8xx) -#define mftbl() ({unsigned long rval; \ - asm volatile("mftbl %0" : "=r" (rval)); rval;}) #define mftbu() ({unsigned long rval; \ asm volatile("mftbu %0" : "=r" (rval)); rval;}) #else -#define mftbl() ({unsigned long rval; \ - asm volatile("mfspr %0, %1" : "=r" (rval) : \ - "i" (SPRN_TBRL)); rval;}) #define mftbu() ({unsigned long rval; \ asm volatile("mfspr %0, %1" : "=r" (rval) : \ "i" (SPRN_TBRU)); rval;}) #endif -#define mftb() mftbl() -#endif /* !__powerpc64__ */ #define mttbl(v) asm volatile("mttbl %0":: "r"(v)) #define mttbu(v) asm volatile("mttbu %0":: "r"(v)) diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index ff30f1076162..29a948e0c0f2 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -174,7 +174,6 @@ #define SPRN_L1CSR1 0x3F3 /* L1 Cache Control and Status Register 1 */ #define SPRN_MMUCSR0 0x3F4 /* MMU Control and Status Register 0 */ #define SPRN_MMUCFG 0x3F7 /* MMU Configuration Register */ -#define SPRN_PIT 0x3DB /* Programmable Interval Timer */ #define SPRN_BUCSR 0x3F5 /* Branch Unit Control and Status */ #define SPRN_L2CSR0 0x3F9 /* L2 Data Cache Control and Status Register 0 */ #define SPRN_L2CSR1 0x3FA /* L2 Data Cache Control and Status Register 1 */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 49a25e2400f2..b2035b2f57ce 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -28,8 +28,8 @@ extern int boot_cpuid; extern int spinning_secondaries; extern u32 *cpu_to_phys_id; +extern bool coregroup_enabled; -extern void cpu_die(void); extern int cpu_to_chip_id(int cpu); #ifdef CONFIG_SMP @@ -50,6 +50,9 @@ struct smp_ops_t { int (*cpu_disable)(void); void (*cpu_die)(unsigned int nr); int (*cpu_bootable)(unsigned int nr); +#ifdef CONFIG_HOTPLUG_CPU + void (*cpu_offline_self)(void); +#endif }; extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); @@ -118,11 +121,6 @@ static inline struct cpumask *cpu_sibling_mask(int cpu) return per_cpu(cpu_sibling_map, cpu); } -static inline struct cpumask *cpu_core_mask(int cpu) -{ - return per_cpu(cpu_core_map, cpu); -} - static inline struct cpumask *cpu_l2_cache_mask(int cpu) { return per_cpu(cpu_l2_cache_map, cpu); @@ -135,6 +133,19 @@ static inline struct cpumask *cpu_smallcore_mask(int cpu) extern int cpu_to_core_id(int cpu); +extern bool has_big_cores; + +#define cpu_smt_mask cpu_smt_mask +#ifdef CONFIG_SCHED_SMT +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + if (has_big_cores) + return per_cpu(cpu_smallcore_map, cpu); + + return per_cpu(cpu_sibling_map, cpu); +} +#endif /* CONFIG_SCHED_SMT */ + /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. * * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up @@ -243,7 +254,6 @@ extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); * 64-bit but defining them all here doesn't harm */ extern void generic_secondary_smp_init(void); -extern void generic_secondary_thread_init(void); extern unsigned long __secondary_hold_spinloop; extern unsigned long __secondary_hold_acknowledge; extern char __secondary_hold; diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h index 85580b30aba4..7546402d796a 100644 --- a/arch/powerpc/include/asm/svm.h +++ b/arch/powerpc/include/asm/svm.h @@ -15,6 +15,8 @@ static inline bool is_secure_guest(void) return mfmsr() & MSR_S; } +void __init svm_swiotlb_init(void); + void dtl_cache_ctor(void *addr); #define get_dtl_cache_ctor() (is_secure_guest() ? dtl_cache_ctor : NULL) @@ -25,6 +27,8 @@ static inline bool is_secure_guest(void) return false; } +static inline void svm_swiotlb_init(void) {} + #define get_dtl_cache_ctor() NULL #endif /* CONFIG_PPC_SVM */ diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index aca70fb43147..1d67bc8d7bc6 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -3,8 +3,9 @@ #define _ASM_POWERPC_SYNCH_H #ifdef __KERNEL__ +#include <asm/cputable.h> #include <asm/feature-fixups.h> -#include <asm/asm-const.h> +#include <asm/ppc-opcode.h> #ifndef __ASSEMBLY__ extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup; @@ -20,6 +21,22 @@ static inline void isync(void) { __asm__ __volatile__ ("isync" : : : "memory"); } + +static inline void ppc_after_tlbiel_barrier(void) +{ + asm volatile("ptesync": : :"memory"); + /* + * POWER9, POWER10 need a cp_abort after tlbiel to ensure the copy is + * invalidated correctly. If this is not done, the paste can take data + * from the physical address that was translated at copy time. + * + * POWER9 in practice does not need this, because address spaces with + * accelerators mapped will use tlbie (which does invalidate the copy) + * to invalidate translations. It's not possible to limit POWER10 this + * way due to local copy-paste. + */ + asm volatile(ASM_FTR_IFSET(PPC_CP_ABORT, "", %0) : : "i" (CPU_FTR_ARCH_31) : "memory"); +} #endif /* __ASSEMBLY__ */ #if defined(__powerpc64__) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index ca6c97025704..46a210b03d2b 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -90,7 +90,6 @@ void arch_setup_new_exec(void); #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_FSCHECK 3 /* Check FS is USER_DS on return */ #define TIF_SYSCALL_EMU 4 /* syscall emulation active */ #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ #define TIF_PATCH_PENDING 6 /* pending live patching update */ @@ -130,7 +129,6 @@ void arch_setup_new_exec(void); #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) #define _TIF_NOHZ (1<<TIF_NOHZ) -#define _TIF_FSCHECK (1<<TIF_FSCHECK) #define _TIF_SYSCALL_EMU (1<<TIF_SYSCALL_EMU) #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ @@ -138,8 +136,7 @@ void arch_setup_new_exec(void); #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ - _TIF_FSCHECK) + _TIF_RESTORE_TM | _TIF_PATCH_PENDING) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) /* Bits in local_flags */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index cb326720a8a1..2f566c1a754c 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -38,44 +38,10 @@ struct div_result { u64 result_low; }; -/* Accessor functions for the timebase (RTC on 601) registers. */ -#define __USE_RTC() (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) - -#ifdef CONFIG_PPC64 - /* For compatibility, get_tbl() is defined as get_tb() on ppc64 */ -#define get_tbl get_tb - -#else - static inline unsigned long get_tbl(void) { - return mftbl(); -} - -static inline unsigned int get_tbu(void) -{ - return mftbu(); -} -#endif /* !CONFIG_PPC64 */ - -static inline unsigned int get_rtcl(void) -{ - unsigned int rtcl; - - asm volatile("mfrtcl %0" : "=r" (rtcl)); - return rtcl; -} - -static inline u64 get_rtc(void) -{ - unsigned int hi, lo, hi2; - - do { - asm volatile("mfrtcu %0; mfrtcl %1; mfrtcu %2" - : "=r" (hi), "=r" (lo), "=r" (hi2)); - } while (hi2 != hi); - return (u64)hi * 1000000000 + lo; + return mftb(); } static inline u64 get_vtb(void) @@ -87,30 +53,21 @@ static inline u64 get_vtb(void) return 0; } -#ifdef CONFIG_PPC64 -static inline u64 get_tb(void) -{ - return mftb(); -} -#else /* CONFIG_PPC64 */ static inline u64 get_tb(void) { unsigned int tbhi, tblo, tbhi2; + if (IS_ENABLED(CONFIG_PPC64)) + return mftb(); + do { - tbhi = get_tbu(); - tblo = get_tbl(); - tbhi2 = get_tbu(); + tbhi = mftbu(); + tblo = mftb(); + tbhi2 = mftbu(); } while (tbhi != tbhi2); return ((u64)tbhi << 32) | tblo; } -#endif /* !CONFIG_PPC64 */ - -static inline u64 get_tb_or_rtc(void) -{ - return __USE_RTC() ? get_rtc() : get_tb(); -} static inline void set_tb(unsigned int upper, unsigned int lower) { @@ -127,11 +84,10 @@ static inline void set_tb(unsigned int upper, unsigned int lower) */ static inline u64 get_dec(void) { -#if defined(CONFIG_40x) - return (mfspr(SPRN_PIT)); -#else - return (mfspr(SPRN_DEC)); -#endif + if (IS_ENABLED(CONFIG_40x)) + return mfspr(SPRN_PIT); + + return mfspr(SPRN_DEC); } /* @@ -141,23 +97,17 @@ static inline u64 get_dec(void) */ static inline void set_dec(u64 val) { -#if defined(CONFIG_40x) - mtspr(SPRN_PIT, (u32) val); -#else -#ifndef CONFIG_BOOKE - --val; -#endif - mtspr(SPRN_DEC, val); -#endif /* not 40x */ + if (IS_ENABLED(CONFIG_40x)) + mtspr(SPRN_PIT, (u32)val); + else if (IS_ENABLED(CONFIG_BOOKE)) + mtspr(SPRN_DEC, val); + else + mtspr(SPRN_DEC, val - 1); } static inline unsigned long tb_ticks_since(unsigned long tstamp) { - if (__USE_RTC()) { - int delta = get_rtcl() - (unsigned int) tstamp; - return delta < 0 ? delta + 1000000000 : delta; - } - return get_tbl() - tstamp; + return mftb() - tstamp; } #define mulhwu(x,y) \ diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h index 6047402b0a4d..95988870a57b 100644 --- a/arch/powerpc/include/asm/timex.h +++ b/arch/powerpc/include/asm/timex.h @@ -17,9 +17,6 @@ typedef unsigned long cycles_t; static inline cycles_t get_cycles(void) { - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) - return 0; - return mftb(); } diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index fbc6f3002f23..d97f061fecac 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -66,19 +66,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm) return false; return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)); } -static inline void mm_reset_thread_local(struct mm_struct *mm) -{ - WARN_ON(atomic_read(&mm->context.copros) > 0); - /* - * It's possible for mm_access to take a reference on mm_users to - * access the remote mm from another thread, but it's not allowed - * to set mm_cpumask, so mm_users may be > 1 here. - */ - WARN_ON(current->mm != mm); - atomic_set(&mm->context.active_cpus, 1); - cpumask_clear(mm_cpumask(mm)); - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); -} #else /* CONFIG_PPC_BOOK3S_64 */ static inline int mm_is_thread_local(struct mm_struct *mm) { diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index f0b6300e7dd3..8728590f514a 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -86,14 +86,27 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) #endif /* CONFIG_NUMA */ +struct drmem_lmb; +int of_drconf_to_nid_single(struct drmem_lmb *lmb); + #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) extern int find_and_online_cpu_nid(int cpu); +extern int cpu_to_coregroup_id(int cpu); #else static inline int find_and_online_cpu_nid(int cpu) { return 0; } +static inline int cpu_to_coregroup_id(int cpu) +{ +#ifdef CONFIG_SMP + return cpu_to_core_id(cpu); +#else + return 0; +#endif +} + #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ #include <asm-generic/topology.h> @@ -104,15 +117,10 @@ static inline int find_and_online_cpu_nid(int cpu) #ifdef CONFIG_PPC64 #include <asm/smp.h> -#ifdef CONFIG_PPC_SPLPAR -int get_physical_package_id(int cpu); -#define topology_physical_package_id(cpu) (get_physical_package_id(cpu)) -#else #define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu)) -#endif #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) +#define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu)) #define topology_core_id(cpu) (cpu_to_core_id(cpu)) #endif diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 20a35373cafc..916daa4b4d0d 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -8,62 +8,21 @@ #include <asm/extable.h> #include <asm/kup.h> -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - * - * The fs/ds values are now the highest legal address in the "segment". - * This simplifies the checking in the routines below. - */ - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(~0UL) #ifdef __powerpc64__ /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */ -#define USER_DS MAKE_MM_SEG(TASK_SIZE_USER64 - 1) +#define TASK_SIZE_MAX TASK_SIZE_USER64 #else -#define USER_DS MAKE_MM_SEG(TASK_SIZE - 1) +#define TASK_SIZE_MAX TASK_SIZE #endif -#define get_fs() (current->thread.addr_limit) - -static inline void set_fs(mm_segment_t fs) +static inline bool __access_ok(unsigned long addr, unsigned long size) { - current->thread.addr_limit = fs; - /* On user-mode return check addr_limit (fs) is correct */ - set_thread_flag(TIF_FSCHECK); + return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr; } -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#define user_addr_max() (get_fs().seg) - -#ifdef __powerpc64__ -/* - * This check is sufficient because there is a large enough - * gap between user addresses and the kernel addresses - */ -#define __access_ok(addr, size, segment) \ - (((addr) <= (segment).seg) && ((size) <= (segment).seg)) - -#else - -static inline int __access_ok(unsigned long addr, unsigned long size, - mm_segment_t seg) -{ - if (addr > seg.seg) - return 0; - return (size == 0 || size - 1 <= seg.seg - addr); -} - -#endif - #define access_ok(addr, size) \ (__chk_user_ptr(addr), \ - __access_ok((__force unsigned long)(addr), (size), get_fs())) + __access_ok((unsigned long)(addr), (size))) /* * These are the main single-value transfer routines. They automatically @@ -151,52 +110,16 @@ static inline int __access_ok(unsigned long addr, unsigned long size, extern long __put_user_bad(void); -/* - * We don't tell gcc that we are accessing memory, but this is OK - * because we do not write to any memory gcc knows about, so there - * are no aliasing issues. - */ -#define __put_user_asm(x, addr, err, op) \ - __asm__ __volatile__( \ - "1: " op " %1,0(%2) # put_user\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: li %0,%3\n" \ - " b 2b\n" \ - ".previous\n" \ - EX_TABLE(1b, 3b) \ - : "=r" (err) \ - : "r" (x), "b" (addr), "i" (-EFAULT), "0" (err)) - -#ifdef __powerpc64__ -#define __put_user_asm2(x, ptr, retval) \ - __put_user_asm(x, ptr, retval, "std") -#else /* __powerpc64__ */ -#define __put_user_asm2(x, addr, err) \ - __asm__ __volatile__( \ - "1: stw %1,0(%2)\n" \ - "2: stw %1+1,4(%2)\n" \ - "3:\n" \ - ".section .fixup,\"ax\"\n" \ - "4: li %0,%3\n" \ - " b 3b\n" \ - ".previous\n" \ - EX_TABLE(1b, 4b) \ - EX_TABLE(2b, 4b) \ - : "=r" (err) \ - : "r" (x), "b" (addr), "i" (-EFAULT), "0" (err)) -#endif /* __powerpc64__ */ - #define __put_user_size_allowed(x, ptr, size, retval) \ do { \ + __label__ __pu_failed; \ + \ retval = 0; \ - switch (size) { \ - case 1: __put_user_asm(x, ptr, retval, "stb"); break; \ - case 2: __put_user_asm(x, ptr, retval, "sth"); break; \ - case 4: __put_user_asm(x, ptr, retval, "stw"); break; \ - case 8: __put_user_asm2(x, ptr, retval); break; \ - default: __put_user_bad(); \ - } \ + __put_user_size_goto(x, ptr, size, __pu_failed); \ + break; \ + \ +__pu_failed: \ + retval = -EFAULT; \ } while (0) #define __put_user_size(x, ptr, size, retval) \ @@ -249,12 +172,17 @@ do { \ }) +/* + * We don't tell gcc that we are accessing memory, but this is OK + * because we do not write to any memory gcc knows about, so there + * are no aliasing issues. + */ #define __put_user_asm_goto(x, addr, label, op) \ asm volatile goto( \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ EX_TABLE(1b, %l2) \ : \ - : "r" (x), "m" (*addr) \ + : "r" (x), "m<>" (*addr) \ : \ : label) @@ -316,7 +244,7 @@ extern long __get_user_bad(void); #define __get_user_asm(x, addr, err, op) \ __asm__ __volatile__( \ - "1: "op" %1,0(%2) # get_user\n" \ + "1: "op"%U2%X2 %1, %2 # get_user\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: li %0,%3\n" \ @@ -325,7 +253,7 @@ extern long __get_user_bad(void); ".previous\n" \ EX_TABLE(1b, 3b) \ : "=r" (err), "=r" (x) \ - : "b" (addr), "i" (-EFAULT), "0" (err)) + : "m<>" (*addr), "i" (-EFAULT), "0" (err)) #ifdef __powerpc64__ #define __get_user_asm2(x, addr, err) \ @@ -333,8 +261,8 @@ extern long __get_user_bad(void); #else /* __powerpc64__ */ #define __get_user_asm2(x, addr, err) \ __asm__ __volatile__( \ - "1: lwz %1,0(%2)\n" \ - "2: lwz %1+1,4(%2)\n" \ + "1: lwz%X2 %1, %2\n" \ + "2: lwz%X2 %L1, %L2\n" \ "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: li %0,%3\n" \ @@ -345,7 +273,7 @@ extern long __get_user_bad(void); EX_TABLE(1b, 4b) \ EX_TABLE(2b, 4b) \ : "=r" (err), "=&r" (x) \ - : "b" (addr), "i" (-EFAULT), "0" (err)) + : "m" (*addr), "i" (-EFAULT), "0" (err)) #endif /* __powerpc64__ */ #define __get_user_size_allowed(x, ptr, size, retval) \ @@ -355,10 +283,10 @@ do { \ if (size > sizeof(x)) \ (x) = __get_user_bad(); \ switch (size) { \ - case 1: __get_user_asm(x, ptr, retval, "lbz"); break; \ - case 2: __get_user_asm(x, ptr, retval, "lhz"); break; \ - case 4: __get_user_asm(x, ptr, retval, "lwz"); break; \ - case 8: __get_user_asm2(x, ptr, retval); break; \ + case 1: __get_user_asm(x, (u8 __user *)ptr, retval, "lbz"); break; \ + case 2: __get_user_asm(x, (u16 __user *)ptr, retval, "lhz"); break; \ + case 4: __get_user_asm(x, (u32 __user *)ptr, retval, "lwz"); break; \ + case 8: __get_user_asm2(x, (u64 __user *)ptr, retval); break; \ default: (x) = __get_user_bad(); \ } \ } while (0) @@ -635,4 +563,20 @@ do { \ __put_user_goto(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e);\ } while (0) +#define HAVE_GET_KERNEL_NOFAULT + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __kr_err; \ + \ + __get_user_size_allowed(*((type *)(dst)), (__force type __user *)(src),\ + sizeof(type), __kr_err); \ + if (unlikely(__kr_err)) \ + goto err_label; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ + __put_user_size_goto(*((type *)(src)), \ + (__force type __user *)(dst), sizeof(type), err_label) + #endif /* _ARCH_POWERPC_UACCESS_H */ diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h index f5f1ccc740fc..7004cfea3f5f 100644 --- a/arch/powerpc/include/uapi/asm/ptrace.h +++ b/arch/powerpc/include/uapi/asm/ptrace.h @@ -222,6 +222,7 @@ struct ppc_debug_info { #define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x0000000000000004 #define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x0000000000000008 #define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x0000000000000010 +#define PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 0x0000000000000020 #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index cbf41fb4ee89..bf0bf1b900d2 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -45,7 +45,8 @@ obj-y := cputable.o syscalls.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ - of_platform.o prom_parse.o firmware.o + of_platform.o prom_parse.o firmware.o \ + hw_breakpoint_constraints.o obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o \ paca.o nvram_64.o note.o syscall_64.o @@ -94,7 +95,8 @@ obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o obj-$(CONFIG_PPC_DOORBELL) += dbell.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -extra-y := head_$(BITS).o +extra-$(CONFIG_PPC64) := head_64.o +extra-$(CONFIG_PPC_BOOK3S_32) := head_book3s_32.o extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8711c2164b45..c2722ff36e98 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -176,6 +176,7 @@ int main(void) OFFSET(THREAD_TM_TAR, thread_struct, tm_tar); OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr); OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr); + OFFSET(THREAD_TM_AMR, thread_struct, tm_amr); OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs); OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 02300edc6989..c22a8e0dbc93 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -95,19 +95,10 @@ void __init btext_prepare_BAT(void) boot_text_mapped = 0; return; } - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { - /* 603, 604, G3, G4, ... */ - lowbits = addr & ~0xFF000000UL; - addr &= 0xFF000000UL; - disp_BAT[0] = vaddr | (BL_16M<<2) | 2; - disp_BAT[1] = addr | (_PAGE_NO_CACHE | _PAGE_GUARDED | BPP_RW); - } else { - /* 601 */ - lowbits = addr & ~0xFF800000UL; - addr &= 0xFF800000UL; - disp_BAT[0] = vaddr | (_PAGE_NO_CACHE | PP_RWXX) | 4; - disp_BAT[1] = addr | BL_8M | 0x40; - } + lowbits = addr & ~0xFF000000UL; + addr &= 0xFF000000UL; + disp_BAT[0] = vaddr | (BL_16M<<2) | 2; + disp_BAT[1] = addr | (_PAGE_NO_CACHE | _PAGE_GUARDED | BPP_RW); logicalDisplayBase = (void *) (vaddr + lowbits); } #endif diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 2aa89c6b2896..492c0b36aff6 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -16,6 +16,7 @@ #include <asm/oprofile_impl.h> #include <asm/cputable.h> #include <asm/prom.h> /* for PTRRELOC on ARCH=ppc */ +#include <asm/mce.h> #include <asm/mmu.h> #include <asm/setup.h> @@ -608,21 +609,6 @@ static struct cpu_spec __initdata cpu_specs[] = { #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC32 -#ifdef CONFIG_PPC_BOOK3S_601 - { /* 601 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00010000, - .cpu_name = "601", - .cpu_features = CPU_FTRS_PPC601, - .cpu_user_features = COMMON_USER | PPC_FEATURE_601_INSTR | - PPC_FEATURE_UNIFIED_CACHE | PPC_FEATURE_NO_TB, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_generic, - .platform = "ppc601", - }, -#endif /* CONFIG_PPC_BOOK3S_601 */ #ifdef CONFIG_PPC_BOOK3S_6xx { /* 603 */ .pvr_mask = 0xffff0000, diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 9053fc9d20c7..a1c744194018 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -138,4 +138,6 @@ const struct dma_map_ops dma_iommu_ops = { .get_required_mask = dma_iommu_get_required_mask, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index f204ad79b6b5..1098863e17ee 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -17,6 +17,7 @@ #include <asm/cputable.h> #include <asm/dt_cpu_ftrs.h> +#include <asm/mce.h> #include <asm/mmu.h> #include <asm/oprofile_impl.h> #include <asm/prom.h> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 94682382fc8c..0e160dffcb86 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -466,7 +466,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) return 0; } - if (!pe->addr && !pe->config_addr) { + if (!pe->addr) { eeh_stats.no_cfg_addr++; return 0; } @@ -929,56 +929,6 @@ void eeh_save_bars(struct eeh_dev *edev) edev->config_space[1] |= PCI_COMMAND_MASTER; } -/** - * eeh_ops_register - Register platform dependent EEH operations - * @ops: platform dependent EEH operations - * - * Register the platform dependent EEH operation callback - * functions. The platform should call this function before - * any other EEH operations. - */ -int __init eeh_ops_register(struct eeh_ops *ops) -{ - if (!ops->name) { - pr_warn("%s: Invalid EEH ops name for %p\n", - __func__, ops); - return -EINVAL; - } - - if (eeh_ops && eeh_ops != ops) { - pr_warn("%s: EEH ops of platform %s already existing (%s)\n", - __func__, eeh_ops->name, ops->name); - return -EEXIST; - } - - eeh_ops = ops; - - return 0; -} - -/** - * eeh_ops_unregister - Unreigster platform dependent EEH operations - * @name: name of EEH platform operations - * - * Unregister the platform dependent EEH operation callback - * functions. - */ -int __exit eeh_ops_unregister(const char *name) -{ - if (!name || !strlen(name)) { - pr_warn("%s: Invalid EEH ops name\n", - __func__); - return -EINVAL; - } - - if (eeh_ops && !strcmp(eeh_ops->name, name)) { - eeh_ops = NULL; - return 0; - } - - return -EEXIST; -} - static int eeh_reboot_notifier(struct notifier_block *nb, unsigned long action, void *unused) { @@ -990,54 +940,6 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; -/** - * eeh_init - EEH initialization - * - * Initialize EEH by trying to enable it for all of the adapters in the system. - * As a side effect we can determine here if eeh is supported at all. - * Note that we leave EEH on so failed config cycles won't cause a machine - * check. If a user turns off EEH for a particular adapter they are really - * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't - * grant access to a slot if EEH isn't enabled, and so we always enable - * EEH for all slots/all devices. - * - * The eeh-force-off option disables EEH checking globally, for all slots. - * Even if force-off is set, the EEH hardware is still enabled, so that - * newer systems can boot. - */ -static int eeh_init(void) -{ - struct pci_controller *hose, *tmp; - int ret = 0; - - /* Register reboot notifier */ - ret = register_reboot_notifier(&eeh_reboot_nb); - if (ret) { - pr_warn("%s: Failed to register notifier (%d)\n", - __func__, ret); - return ret; - } - - /* call platform initialization function */ - if (!eeh_ops) { - pr_warn("%s: Platform EEH operation not found\n", - __func__); - return -EEXIST; - } else if ((ret = eeh_ops->init())) - return ret; - - /* Initialize PHB PEs */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) - eeh_phb_pe_create(hose); - - eeh_addr_cache_init(); - - /* Initialize EEH event */ - return eeh_event_init(); -} - -core_initcall_sync(eeh_init); - static int eeh_device_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -1062,12 +964,47 @@ static struct notifier_block eeh_device_nb = { .notifier_call = eeh_device_notifier, }; -static __init int eeh_set_bus_notifier(void) +/** + * eeh_init - System wide EEH initialization + * + * It's the platform's job to call this from an arch_initcall(). + */ +int eeh_init(struct eeh_ops *ops) { - bus_register_notifier(&pci_bus_type, &eeh_device_nb); - return 0; + struct pci_controller *hose, *tmp; + int ret = 0; + + /* the platform should only initialise EEH once */ + if (WARN_ON(eeh_ops)) + return -EEXIST; + if (WARN_ON(!ops)) + return -ENOENT; + eeh_ops = ops; + + /* Register reboot notifier */ + ret = register_reboot_notifier(&eeh_reboot_nb); + if (ret) { + pr_warn("%s: Failed to register reboot notifier (%d)\n", + __func__, ret); + return ret; + } + + ret = bus_register_notifier(&pci_bus_type, &eeh_device_nb); + if (ret) { + pr_warn("%s: Failed to register bus notifier (%d)\n", + __func__, ret); + return ret; + } + + /* Initialize PHB PEs */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + eeh_phb_pe_create(hose); + + eeh_addr_cache_init(); + + /* Initialize EEH event */ + return eeh_event_init(); } -arch_initcall(eeh_set_bus_notifier); /** * eeh_probe_device() - Perform EEH initialization for the indicated pci device @@ -1720,7 +1657,7 @@ static ssize_t eeh_force_recover_write(struct file *filp, return -ENODEV; /* Retrieve PE */ - pe = eeh_pe_get(hose, pe_no, 0); + pe = eeh_pe_get(hose, pe_no); if (!pe) return -ENODEV; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d2aaaa73fdd5..845e024321d4 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -251,43 +251,21 @@ void eeh_pe_dev_traverse(struct eeh_pe *root, /** * __eeh_pe_get - Check the PE address - * @data: EEH PE - * @flag: EEH device * * For one particular PE, it can be identified by PE address * or tranditional BDF address. BDF address is composed of * Bus/Device/Function number. The extra data referred by flag * indicates which type of address should be used. */ -struct eeh_pe_get_flag { - int pe_no; - int config_addr; -}; - static void *__eeh_pe_get(struct eeh_pe *pe, void *flag) { - struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag; + int *target_pe = flag; - /* Unexpected PHB PE */ + /* PHB PEs are special and should be ignored */ if (pe->type & EEH_PE_PHB) return NULL; - /* - * We prefer PE address. For most cases, we should - * have non-zero PE address - */ - if (eeh_has_flag(EEH_VALID_PE_ZERO)) { - if (tmp->pe_no == pe->addr) - return pe; - } else { - if (tmp->pe_no && - (tmp->pe_no == pe->addr)) - return pe; - } - - /* Try BDF address */ - if (tmp->config_addr && - (tmp->config_addr == pe->config_addr)) + if (*target_pe == pe->addr) return pe; return NULL; @@ -297,7 +275,6 @@ static void *__eeh_pe_get(struct eeh_pe *pe, void *flag) * eeh_pe_get - Search PE based on the given address * @phb: PCI controller * @pe_no: PE number - * @config_addr: Config address * * Search the corresponding PE based on the specified address which * is included in the eeh device. The function is used to check if @@ -306,16 +283,11 @@ static void *__eeh_pe_get(struct eeh_pe *pe, void *flag) * which is composed of PCI bus/device/function number, or unified * PE address. */ -struct eeh_pe *eeh_pe_get(struct pci_controller *phb, - int pe_no, int config_addr) +struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no) { struct eeh_pe *root = eeh_phb_pe_get(phb); - struct eeh_pe_get_flag tmp = { pe_no, config_addr }; - struct eeh_pe *pe; - pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp); - - return pe; + return eeh_pe_traverse(root, __eeh_pe_get, &pe_no); } /** @@ -336,19 +308,13 @@ int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent) struct pci_controller *hose = edev->controller; struct eeh_pe *pe, *parent; - /* Check if the PE number is valid */ - if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { - eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n"); - return -EINVAL; - } - /* * Search the PE has been existing or not according * to the PE address. If that has been existing, the * PE should be composed of PCI bus and its subordinate * components. */ - pe = eeh_pe_get(hose, edev->pe_config_addr, edev->bdfn); + pe = eeh_pe_get(hose, edev->pe_config_addr); if (pe) { if (pe->type & EEH_PE_INVALID) { list_add_tail(&edev->entry, &pe->edevs); @@ -388,8 +354,8 @@ int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent) pr_err("%s: out of memory!\n", __func__); return -ENOMEM; } - pe->addr = edev->pe_config_addr; - pe->config_addr = edev->bdfn; + + pe->addr = edev->pe_config_addr; /* * Put the new EEH PE into hierarchy tree. If the parent diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index f4d0af8e1136..8cdc8bcde703 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -234,7 +234,6 @@ transfer_to_handler_cont: mtspr SPRN_SRR0,r11 mtspr SPRN_SRR1,r10 mtlr r9 - SYNC RFI /* jump to handler, enable MMU */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) @@ -264,7 +263,6 @@ _ASM_NOKPROBE_SYMBOL(transfer_to_handler_cont) LOAD_REG_IMMEDIATE(r0, MSR_KERNEL) mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r0 - SYNC RFI reenable_mmu: @@ -323,7 +321,6 @@ stack_ovf: #endif mtspr SPRN_SRR0,r9 mtspr SPRN_SRR1,r10 - SYNC RFI _ASM_NOKPROBE_SYMBOL(stack_ovf) #endif @@ -411,7 +408,6 @@ ret_from_syscall: /* disable interrupts so current_thread_info()->flags can't change */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ - SYNC mtmsr r10 lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO @@ -474,7 +470,6 @@ syscall_exit_finish: #endif mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - SYNC RFI _ASM_NOKPROBE_SYMBOL(syscall_exit_finish) #ifdef CONFIG_44x @@ -567,7 +562,6 @@ syscall_exit_work: * lockdep as we are supposed to have IRQs on at this point */ ori r10,r10,MSR_EE - SYNC mtmsr r10 /* Save NVGPRS if they're not saved already */ @@ -606,7 +600,6 @@ ret_from_kernel_syscall: #endif mtspr SPRN_SRR0, r9 mtspr SPRN_SRR1, r10 - SYNC RFI _ASM_NOKPROBE_SYMBOL(ret_from_kernel_syscall) @@ -810,7 +803,6 @@ fast_exception_return: REST_GPR(9, r11) REST_GPR(12, r11) lwz r11,GPR11(r11) - SYNC RFI _ASM_NOKPROBE_SYMBOL(fast_exception_return) @@ -819,19 +811,11 @@ _ASM_NOKPROBE_SYMBOL(fast_exception_return) 1: lis r3,exc_exit_restart_end@ha addi r3,r3,exc_exit_restart_end@l cmplw r12,r3 -#ifdef CONFIG_PPC_BOOK3S_601 - bge 2b -#else bge 3f -#endif lis r4,exc_exit_restart@ha addi r4,r4,exc_exit_restart@l cmplw r12,r4 -#ifdef CONFIG_PPC_BOOK3S_601 - blt 2b -#else blt 3f -#endif lis r3,fee_restarts@ha tophys(r3,r3) lwz r5,fee_restarts@l(r3) @@ -848,7 +832,6 @@ fee_restarts: /* aargh, a nonrecoverable interrupt, panic */ /* aargh, we don't know which trap this is */ -/* but the 601 doesn't implement the RI bit, so assume it's OK */ 3: li r10,-1 stw r10,_TRAP(r11) @@ -872,7 +855,6 @@ ret_from_except: * from the interrupt. */ /* Note: We don't bother telling lockdep about it */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - SYNC /* Some chip revs have problems here... */ mtmsr r10 /* disable interrupts */ lwz r3,_MSR(r1) /* Returning to user mode? */ @@ -1035,7 +1017,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) * exc_exit_restart below. -- paulus */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI) - SYNC mtmsr r10 /* clear the RI bit */ .globl exc_exit_restart exc_exit_restart: @@ -1046,7 +1027,6 @@ exc_exit_restart: lwz r1,GPR1(r1) .globl exc_exit_restart_end exc_exit_restart_end: - SYNC RFI _ASM_NOKPROBE_SYMBOL(exc_exit_restart) _ASM_NOKPROBE_SYMBOL(exc_exit_restart_end) @@ -1274,7 +1254,6 @@ do_resched: /* r10 contains MSR_KERNEL here */ mfmsr r10 #endif ori r10,r10,MSR_EE - SYNC mtmsr r10 /* hard-enable interrupts */ bl schedule recheck: @@ -1283,7 +1262,6 @@ recheck: * TI_FLAGS aren't advertised. */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - SYNC mtmsr r10 /* disable interrupts */ lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_NEED_RESCHED @@ -1292,7 +1270,6 @@ recheck: beq restore_user do_user_signal: /* r10 contains MSR_KERNEL here */ ori r10,r10,MSR_EE - SYNC mtmsr r10 /* hard-enable interrupts */ /* save r13-r31 in the exception frame, if not already done */ lwz r3,_TRAP(r1) @@ -1316,19 +1293,11 @@ nonrecoverable: lis r10,exc_exit_restart_end@ha addi r10,r10,exc_exit_restart_end@l cmplw r12,r10 -#ifdef CONFIG_PPC_BOOK3S_601 - bgelr -#else bge 3f -#endif lis r11,exc_exit_restart@ha addi r11,r11,exc_exit_restart@l cmplw r12,r11 -#ifdef CONFIG_PPC_BOOK3S_601 - bltlr -#else blt 3f -#endif lis r10,ee_restarts@ha lwz r12,ee_restarts@l(r10) addi r12,r12,1 @@ -1336,7 +1305,6 @@ nonrecoverable: mr r12,r11 /* restart at exc_exit_restart */ blr 3: /* OK, we can't recover, kill this process */ - /* but the 601 doesn't implement the RI bit, so assume it's OK */ lwz r3,_TRAP(r1) andi. r0,r3,1 beq 5f @@ -1382,8 +1350,7 @@ _GLOBAL(enter_rtas) mfmsr r9 stw r9,8(r1) LOAD_REG_IMMEDIATE(r0,MSR_KERNEL) - SYNC /* disable interrupts so SRR0/1 */ - mtmsr r0 /* don't get trashed */ + mtmsr r0 /* disable interrupts so SRR0/1 don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 stw r7, THREAD + RTAS_SP(r2) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 733e40eba4eb..2f3846192ec7 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -430,7 +430,11 @@ _ASM_NOKPROBE_SYMBOL(save_nvgprs); #define FLUSH_COUNT_CACHE \ 1: nop; \ - patch_site 1b, patch__call_flush_branch_caches + patch_site 1b, patch__call_flush_branch_caches1; \ +1: nop; \ + patch_site 1b, patch__call_flush_branch_caches2; \ +1: nop; \ + patch_site 1b, patch__call_flush_branch_caches3 .macro nops number .rept \number @@ -512,7 +516,7 @@ _GLOBAL(_switch) kuap_check_amr r9, r10 - FLUSH_COUNT_CACHE + FLUSH_COUNT_CACHE /* Clobbers r9, ctr */ /* * On SMP kernels, care must be taken because a task may be diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index d9ed79415100..f579ce46eef2 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -988,7 +988,6 @@ kernel_dbg_exc: .endm masked_interrupt_book3e_0x500: - // XXX When adding support for EPR, use PACA_IRQ_EE_EDGE masked_interrupt_book3e PACA_IRQ_EE 1 masked_interrupt_book3e_0x900: @@ -1303,16 +1302,6 @@ fast_exception_return: addi r3,r1,STACK_FRAME_OVERHEAD; bl do_IRQ b ret_from_except -1: cmpwi cr0,r3,0xf00 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl performance_monitor_exception - b ret_from_except -1: cmpwi cr0,r3,0xe60 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl handle_hmi_exception - b ret_from_except 1: cmpwi cr0,r3,0x900 bne 1f addi r3,r1,STACK_FRAME_OVERHEAD; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 5cdf4168a61a..8482739d42f3 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -754,10 +754,8 @@ u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) void fadump_update_elfcore_header(char *bufp) { - struct elfhdr *elf; struct elf_phdr *phdr; - elf = (struct elfhdr *)bufp; bufp += sizeof(struct elfhdr); /* First note is a place holder for cpu notes info. */ diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 4ae39db70044..3ff9a8fafa46 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -87,7 +87,6 @@ BEGIN_FTR_SECTION oris r5,r5,MSR_VSX@h END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - SYNC MTMSRD(r5) /* enable use of fpu now */ isync /* enable use of FP after return */ @@ -134,18 +133,3 @@ _GLOBAL(save_fpu) mffs fr0 stfd fr0,FPSTATE_FPSCR(r6) blr - -/* - * These are used in the alignment trap handler when emulating - * single-precision loads and stores. - */ - -_GLOBAL(cvt_fd) - lfs 0,0(r3) - stfd 0,0(r4) - blr - -_GLOBAL(cvt_df) - lfd 0,0(r3) - stfs 0,0(r4) - blr diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 9abec6cd099c..7c767765071d 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -40,48 +40,52 @@ .macro EXCEPTION_PROLOG_1 for_rtas=0 #ifdef CONFIG_VMAP_STACK - .ifeq \for_rtas - li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ - mtmsr r11 - isync - .endif - subi r11, r1, INT_FRAME_SIZE /* use r1 if kernel */ + mr r11, r1 + subi r1, r1, INT_FRAME_SIZE /* use r1 if kernel */ + beq 1f + mfspr r1,SPRN_SPRG_THREAD + lwz r1,TASK_STACK-THREAD(r1) + addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE #else - tophys(r11,r1) /* use tophys(r1) if kernel */ - subi r11, r11, INT_FRAME_SIZE /* alloc exc. frame */ -#endif + subi r11, r1, INT_FRAME_SIZE /* use r1 if kernel */ beq 1f mfspr r11,SPRN_SPRG_THREAD - tovirt_vmstack r11, r11 lwz r11,TASK_STACK-THREAD(r11) addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE - tophys_novmstack r11, r11 +#endif 1: + tophys_novmstack r11, r11 #ifdef CONFIG_VMAP_STACK - mtcrf 0x7f, r11 + mtcrf 0x7f, r1 bt 32 - THREAD_ALIGN_SHIFT, stack_overflow #endif .endm .macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0 -#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S) -BEGIN_MMU_FTR_SECTION +#ifdef CONFIG_VMAP_STACK mtcr r10 -FTR_SECTION_ELSE - stw r10, _CCR(r11) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) + li r10, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r10 + isync #else stw r10,_CCR(r11) /* save registers */ #endif mfspr r10, SPRN_SPRG_SCRATCH0 +#ifdef CONFIG_VMAP_STACK + stw r11,GPR1(r1) + stw r11,0(r1) + mr r11, r1 +#else + stw r1,GPR1(r11) + stw r1,0(r11) + tovirt(r1, r11) /* set new kernel sp */ +#endif stw r12,GPR12(r11) stw r9,GPR9(r11) stw r10,GPR10(r11) -#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S) -BEGIN_MMU_FTR_SECTION +#ifdef CONFIG_VMAP_STACK mfcr r10 stw r10, _CCR(r11) -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) #endif mfspr r12,SPRN_SPRG_SCRATCH1 stw r12,GPR11(r11) @@ -97,19 +101,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) stw r10, _DSISR(r11) .endif lwz r9, SRR1(r12) -#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S) -BEGIN_MMU_FTR_SECTION andi. r10, r9, MSR_PR -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -#endif lwz r12, SRR0(r12) #else mfspr r12,SPRN_SRR0 mfspr r9,SPRN_SRR1 #endif - stw r1,GPR1(r11) - stw r1,0(r11) - tovirt_novmstack r1, r11 /* set new kernel sp */ #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #else @@ -225,7 +222,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) #endif mtspr SPRN_SRR1,r10 mtspr SPRN_SRR0,r11 - SYNC RFI /* jump to handler, enable MMU */ 99: b ret_from_kernel_syscall .endm @@ -327,20 +323,19 @@ label: .macro vmap_stack_overflow_exception #ifdef CONFIG_VMAP_STACK #ifdef CONFIG_SMP - mfspr r11, SPRN_SPRG_THREAD - tovirt(r11, r11) - lwz r11, TASK_CPU - THREAD(r11) - slwi r11, r11, 3 - addis r11, r11, emergency_ctx@ha + mfspr r1, SPRN_SPRG_THREAD + lwz r1, TASK_CPU - THREAD(r1) + slwi r1, r1, 3 + addis r1, r1, emergency_ctx@ha #else - lis r11, emergency_ctx@ha + lis r1, emergency_ctx@ha #endif - lwz r11, emergency_ctx@l(r11) - cmpwi cr1, r11, 0 + lwz r1, emergency_ctx@l(r1) + cmpwi cr1, r1, 0 bne cr1, 1f - lis r11, init_thread_union@ha - addi r11, r11, init_thread_union@l -1: addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE + lis r1, init_thread_union@ha + addi r1, r1, init_thread_union@l +1: addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE EXCEPTION_PROLOG_2 SAVE_NVGPRS(r11) addi r3, r1, STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 5b282d9965a5..44c9018aed1b 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -72,7 +72,6 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC rfi /* enables MMU */ b . /* prevent prefetch past rfi */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 0e05a9a47a4b..1510b2a56669 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -300,9 +300,6 @@ _GLOBAL(fsl_secondary_thread_init) rlwimi r3, r3, 30, 2, 30 mtspr SPRN_PIR, r3 1: -#endif - -_GLOBAL(generic_secondary_thread_init) mr r24,r3 /* turn on 64-bit mode */ @@ -312,13 +309,13 @@ _GLOBAL(generic_secondary_thread_init) bl relative_toc tovirt(r2,r2) -#ifdef CONFIG_PPC_BOOK3E /* Book3E initialization */ mr r3,r24 bl book3e_secondary_thread_init -#endif b generic_secondary_common_init +#endif /* CONFIG_PPC_BOOK3E */ + /* * On pSeries and most other platforms, secondary processors spin * in the following code. diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_book3s_32.S index f3ab94d73936..5eb9eedac920 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -34,16 +34,6 @@ #include "head_32.h" -/* 601 only have IBAT */ -#ifdef CONFIG_PPC_BOOK3S_601 -#define LOAD_BAT(n, reg, RA, RB) \ - li RA,0; \ - mtspr SPRN_IBAT##n##U,RA; \ - lwz RA,(n*16)+0(reg); \ - lwz RB,(n*16)+4(reg); \ - mtspr SPRN_IBAT##n##U,RA; \ - mtspr SPRN_IBAT##n##L,RB -#else #define LOAD_BAT(n, reg, RA, RB) \ /* see the comment for clear_bats() -- Cort */ \ li RA,0; \ @@ -57,11 +47,10 @@ lwz RB,(n*16)+12(reg); \ mtspr SPRN_DBAT##n##U,RA; \ mtspr SPRN_DBAT##n##L,RB -#endif __HEAD .stabs "arch/powerpc/kernel/",N_SO,0,0,0f - .stabs "head_32.S",N_SO,0,0,0f + .stabs "head_book3s_32.S",N_SO,0,0,0f 0: _ENTRY(_stext); @@ -166,9 +155,9 @@ __after_mmu_off: bl initial_bats bl load_segment_registers -#ifdef CONFIG_KASAN +BEGIN_MMU_FTR_SECTION bl early_hash_table -#endif +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) #if defined(CONFIG_BOOTX_TEXT) bl setup_disp_bat #endif @@ -185,10 +174,8 @@ __after_mmu_off: bl reloc_offset li r24,0 /* cpu# */ bl call_setup_cpu /* Call setup_cpu for this CPU */ -#ifdef CONFIG_PPC_BOOK3S_32 bl reloc_offset bl init_idle_6xx -#endif /* CONFIG_PPC_BOOK3S_32 */ /* @@ -219,7 +206,6 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC RFI /* enables MMU */ /* @@ -274,14 +260,8 @@ __secondary_hold_acknowledge: DO_KVM 0x200 MachineCheck: EXCEPTION_PROLOG_0 -#ifdef CONFIG_VMAP_STACK - li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ - mtmsr r11 - isync -#endif #ifdef CONFIG_PPC_CHRP mfspr r11, SPRN_SPRG_THREAD - tovirt_vmstack r11, r11 lwz r11, RTAS_SP(r11) cmpwi cr1, r11, 0 bne cr1, 7f @@ -439,7 +419,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) SystemCall: SYSCALL_ENTRY 0xc00 -/* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) @@ -790,14 +769,12 @@ fast_hash_page_return: mtcr r11 lwz r11, THR11(r10) mfspr r10, SPRN_SPRG_SCRATCH0 - SYNC RFI 1: /* ISI */ mtcr r11 mfspr r11, SPRN_SPRG_SCRATCH1 mfspr r10, SPRN_SPRG_SCRATCH0 - SYNC RFI stack_overflow: @@ -888,7 +865,6 @@ __secondary_start_pmac_0: set to map the 0xf0000000 - 0xffffffff region */ mfmsr r0 rlwinm r0,r0,0,28,26 /* clear DR (0x10) */ - SYNC mtmsr r0 isync @@ -900,10 +876,8 @@ __secondary_start: lis r3,-KERNELBASE@h mr r4,r24 bl call_setup_cpu /* Call setup_cpu for this CPU */ -#ifdef CONFIG_PPC_BOOK3S_32 lis r3,-KERNELBASE@h bl init_idle_6xx -#endif /* CONFIG_PPC_BOOK3S_32 */ /* get current's stack and current */ lis r2,secondary_current@ha @@ -936,7 +910,6 @@ __secondary_start: ori r3,r3,start_secondary@l mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - SYNC RFI #endif /* CONFIG_SMP */ @@ -945,21 +918,9 @@ __secondary_start: #endif /* - * Those generic dummy functions are kept for CPUs not - * included in CONFIG_PPC_BOOK3S_32 - */ -#if !defined(CONFIG_PPC_BOOK3S_32) -_ENTRY(__save_cpu_setup) - blr -_ENTRY(__restore_cpu_setup) - blr -#endif /* !defined(CONFIG_PPC_BOOK3S_32) */ - -/* * Load stuff into the MMU. Intended to be called with * IR=0 and DR=0. */ -#ifdef CONFIG_KASAN early_hash_table: sync /* Force all PTE updates to finish */ isync @@ -970,8 +931,10 @@ early_hash_table: lis r6, early_hash - PAGE_OFFSET@h ori r6, r6, 3 /* 256kB table */ mtspr SPRN_SDR1, r6 + lis r6, early_hash@h + lis r3, Hash@ha + stw r6, Hash@l(r3) blr -#endif load_up_mmu: sync /* Force all PTE updates to finish */ @@ -985,8 +948,7 @@ load_up_mmu: lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 -/* Load the BAT registers with the values set up by MMU_init. - MMU_init takes care of whether we're on a 601 or not. */ +/* Load the BAT registers with the values set up by MMU_init. */ lis r3,BATS@ha addi r3,r3,BATS@l tophys(r3,r3) @@ -1002,7 +964,7 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr -load_segment_registers: +_GLOBAL(load_segment_registers) li r0, NUM_USER_SEGMENTS /* load up user segment register values */ mtctr r0 /* for context 0 */ li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ @@ -1061,11 +1023,7 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init -#ifdef CONFIG_KASAN -BEGIN_MMU_FTR_SECTION bl MMU_init_hw_patch -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -#endif /* * Go back to running unmapped so we can load up new values @@ -1080,7 +1038,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 - SYNC RFI /* Load up the kernel context */ 2: bl load_up_mmu @@ -1092,7 +1049,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) */ lis r5, abatron_pteptrs@h ori r5, r5, abatron_pteptrs@l - stw r5, 0xf0(r0) /* This much match your Abatron config */ + stw r5, 0xf0(0) /* This much match your Abatron config */ lis r6, swapper_pg_dir@h ori r6, r6, swapper_pg_dir@l tophys(r5, r5) @@ -1105,7 +1062,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) ori r3,r3,start_kernel@l mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - SYNC RFI /* @@ -1165,7 +1121,6 @@ EXPORT_SYMBOL(switch_mmu_context) clear_bats: li r10,0 -#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT0U,r10 mtspr SPRN_DBAT0L,r10 mtspr SPRN_DBAT1U,r10 @@ -1174,7 +1129,6 @@ clear_bats: mtspr SPRN_DBAT2L,r10 mtspr SPRN_DBAT3U,r10 mtspr SPRN_DBAT3L,r10 -#endif mtspr SPRN_IBAT0U,r10 mtspr SPRN_IBAT0L,r10 mtspr SPRN_IBAT1U,r10 @@ -1223,7 +1177,6 @@ _ENTRY(update_bats) .align 4 mtspr SPRN_SRR0, r4 mtspr SPRN_SRR1, r3 - SYNC RFI 1: bl clear_bats lis r3, BATS@ha @@ -1243,7 +1196,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) mtmsr r3 mtspr SPRN_SRR0, r7 mtspr SPRN_SRR1, r6 - SYNC RFI flush_tlbs: @@ -1267,26 +1219,9 @@ mmu_off: sync RFI -/* - * On 601, we use 3 BATs to map up to 24M of RAM at _PAGE_OFFSET - * (we keep one for debugging) and on others, we use one 256M BAT. - */ +/* We use one BAT to map up to 256M of RAM at _PAGE_OFFSET */ initial_bats: lis r11,PAGE_OFFSET@h -#ifdef CONFIG_PPC_BOOK3S_601 - ori r11,r11,4 /* set up BAT registers for 601 */ - li r8,0x7f /* valid, block length = 8MB */ - mtspr SPRN_IBAT0U,r11 /* N.B. 601 has valid bit in */ - mtspr SPRN_IBAT0L,r8 /* lower BAT register */ - addis r11,r11,0x800000@h - addis r8,r8,0x800000@h - mtspr SPRN_IBAT1U,r11 - mtspr SPRN_IBAT1L,r8 - addis r11,r11,0x800000@h - addis r8,r8,0x800000@h - mtspr SPRN_IBAT2U,r11 - mtspr SPRN_IBAT2L,r8 -#else tophys(r8,r11) #ifdef CONFIG_SMP ori r8,r8,0x12 /* R/W access, M=1 */ @@ -1295,11 +1230,10 @@ initial_bats: #endif /* CONFIG_SMP */ ori r11,r11,BL_256M<<2|0x2 /* set up BAT registers for 604 */ - mtspr SPRN_DBAT0L,r8 /* N.B. 6xx (not 601) have valid */ + mtspr SPRN_DBAT0L,r8 /* N.B. 6xx have valid */ mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */ mtspr SPRN_IBAT0L,r8 mtspr SPRN_IBAT0U,r11 -#endif isync blr @@ -1317,13 +1251,8 @@ setup_disp_bat: beqlr lwz r11,0(r8) lwz r8,4(r8) -#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT3L,r8 mtspr SPRN_DBAT3U,r11 -#else - mtspr SPRN_IBAT3L,r8 - mtspr SPRN_IBAT3U,r11 -#endif blr #endif /* CONFIG_BOOTX_TEXT */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 18f87bf9e32b..71c359d438b5 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -176,7 +176,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) #endif mtspr SPRN_SRR1,r10 mtspr SPRN_SRR0,r11 - SYNC RFI /* jump to handler, enable MMU */ 99: b ret_from_kernel_syscall .endm diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 1f4a1efa0074..f4e8f21046f5 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -494,151 +494,6 @@ reset: } } -static bool dar_in_user_range(unsigned long dar, struct arch_hw_breakpoint *info) -{ - return ((info->address <= dar) && (dar - info->address < info->len)); -} - -static bool ea_user_range_overlaps(unsigned long ea, int size, - struct arch_hw_breakpoint *info) -{ - return ((ea < info->address + info->len) && - (ea + size > info->address)); -} - -static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info) -{ - unsigned long hw_start_addr, hw_end_addr; - - hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE); - hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE); - - return ((hw_start_addr <= dar) && (hw_end_addr > dar)); -} - -static bool ea_hw_range_overlaps(unsigned long ea, int size, - struct arch_hw_breakpoint *info) -{ - unsigned long hw_start_addr, hw_end_addr; - - hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE); - hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE); - - return ((ea < hw_end_addr) && (ea + size > hw_start_addr)); -} - -/* - * If hw has multiple DAWR registers, we also need to check all - * dawrx constraint bits to confirm this is _really_ a valid event. - * If type is UNKNOWN, but privilege level matches, consider it as - * a positive match. - */ -static bool check_dawrx_constraints(struct pt_regs *regs, int type, - struct arch_hw_breakpoint *info) -{ - if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ)) - return false; - - /* - * The Cache Management instructions other than dcbz never - * cause a match. i.e. if type is CACHEOP, the instruction - * is dcbz, and dcbz is treated as Store. - */ - if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE)) - return false; - - if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL)) - return false; - - if (user_mode(regs) && !(info->type & HW_BRK_TYPE_USER)) - return false; - - return true; -} - -/* - * Return true if the event is valid wrt dawr configuration, - * including extraneous exception. Otherwise return false. - */ -static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr, - unsigned long ea, int type, int size, - struct arch_hw_breakpoint *info) -{ - bool in_user_range = dar_in_user_range(regs->dar, info); - bool dawrx_constraints; - - /* - * 8xx supports only one breakpoint and thus we can - * unconditionally return true. - */ - if (IS_ENABLED(CONFIG_PPC_8xx)) { - if (!in_user_range) - info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; - return true; - } - - if (unlikely(ppc_inst_equal(instr, ppc_inst(0)))) { - if (cpu_has_feature(CPU_FTR_ARCH_31) && - !dar_in_hw_range(regs->dar, info)) - return false; - - return true; - } - - dawrx_constraints = check_dawrx_constraints(regs, type, info); - - if (type == UNKNOWN) { - if (cpu_has_feature(CPU_FTR_ARCH_31) && - !dar_in_hw_range(regs->dar, info)) - return false; - - return dawrx_constraints; - } - - if (ea_user_range_overlaps(ea, size, info)) - return dawrx_constraints; - - if (ea_hw_range_overlaps(ea, size, info)) { - if (dawrx_constraints) { - info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; - return true; - } - } - return false; -} - -static int cache_op_size(void) -{ -#ifdef __powerpc64__ - return ppc64_caches.l1d.block_size; -#else - return L1_CACHE_BYTES; -#endif -} - -static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, - int *type, int *size, unsigned long *ea) -{ - struct instruction_op op; - - if (__get_user_instr_inatomic(*instr, (void __user *)regs->nip)) - return; - - analyse_instr(&op, regs, *instr); - *type = GETTYPE(op.type); - *ea = op.ea; -#ifdef __powerpc64__ - if (!(regs->msr & MSR_64BIT)) - *ea &= 0xffffffffUL; -#endif - - *size = GETSIZE(op.type); - if (*type == CACHEOP) { - *size = cache_op_size(); - *ea &= ~(*size - 1); - } -} - static bool is_larx_stcx_instr(int type) { return type == LARX || type == STCX; @@ -722,7 +577,7 @@ int hw_breakpoint_handler(struct die_args *args) rcu_read_lock(); if (!IS_ENABLED(CONFIG_PPC_8xx)) - get_instr_detail(regs, &instr, &type, &size, &ea); + wp_get_instr_detail(regs, &instr, &type, &size, &ea); for (i = 0; i < nr_wp_slots(); i++) { bp[i] = __this_cpu_read(bp_per_reg[i]); @@ -732,7 +587,7 @@ int hw_breakpoint_handler(struct die_args *args) info[i] = counter_arch_bp(bp[i]); info[i]->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; - if (check_constraints(regs, instr, ea, type, size, info[i])) { + if (wp_check_constraints(regs, instr, ea, type, size, info[i])) { if (!IS_ENABLED(CONFIG_PPC_8xx) && ppc_inst_equal(instr, ppc_inst(0))) { handler_error(bp[i], info[i]); diff --git a/arch/powerpc/kernel/hw_breakpoint_constraints.c b/arch/powerpc/kernel/hw_breakpoint_constraints.c new file mode 100644 index 000000000000..867ee4aa026a --- /dev/null +++ b/arch/powerpc/kernel/hw_breakpoint_constraints.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <asm/hw_breakpoint.h> +#include <asm/sstep.h> +#include <asm/cache.h> + +static bool dar_in_user_range(unsigned long dar, struct arch_hw_breakpoint *info) +{ + return ((info->address <= dar) && (dar - info->address < info->len)); +} + +static bool ea_user_range_overlaps(unsigned long ea, int size, + struct arch_hw_breakpoint *info) +{ + return ((ea < info->address + info->len) && + (ea + size > info->address)); +} + +static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info) +{ + unsigned long hw_start_addr, hw_end_addr; + + hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE); + hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE); + + return ((hw_start_addr <= dar) && (hw_end_addr > dar)); +} + +static bool ea_hw_range_overlaps(unsigned long ea, int size, + struct arch_hw_breakpoint *info) +{ + unsigned long hw_start_addr, hw_end_addr; + unsigned long align_size = HW_BREAKPOINT_SIZE; + + /* + * On p10 predecessors, quadword is handle differently then + * other instructions. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_31) && size == 16) + align_size = HW_BREAKPOINT_SIZE_QUADWORD; + + hw_start_addr = ALIGN_DOWN(info->address, align_size); + hw_end_addr = ALIGN(info->address + info->len, align_size); + + return ((ea < hw_end_addr) && (ea + size > hw_start_addr)); +} + +/* + * If hw has multiple DAWR registers, we also need to check all + * dawrx constraint bits to confirm this is _really_ a valid event. + * If type is UNKNOWN, but privilege level matches, consider it as + * a positive match. + */ +static bool check_dawrx_constraints(struct pt_regs *regs, int type, + struct arch_hw_breakpoint *info) +{ + if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ)) + return false; + + /* + * The Cache Management instructions other than dcbz never + * cause a match. i.e. if type is CACHEOP, the instruction + * is dcbz, and dcbz is treated as Store. + */ + if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE)) + return false; + + if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL)) + return false; + + if (user_mode(regs) && !(info->type & HW_BRK_TYPE_USER)) + return false; + + return true; +} + +/* + * Return true if the event is valid wrt dawr configuration, + * including extraneous exception. Otherwise return false. + */ +bool wp_check_constraints(struct pt_regs *regs, struct ppc_inst instr, + unsigned long ea, int type, int size, + struct arch_hw_breakpoint *info) +{ + bool in_user_range = dar_in_user_range(regs->dar, info); + bool dawrx_constraints; + + /* + * 8xx supports only one breakpoint and thus we can + * unconditionally return true. + */ + if (IS_ENABLED(CONFIG_PPC_8xx)) { + if (!in_user_range) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + return true; + } + + if (unlikely(ppc_inst_equal(instr, ppc_inst(0)))) { + if (cpu_has_feature(CPU_FTR_ARCH_31) && + !dar_in_hw_range(regs->dar, info)) + return false; + + return true; + } + + dawrx_constraints = check_dawrx_constraints(regs, type, info); + + if (type == UNKNOWN) { + if (cpu_has_feature(CPU_FTR_ARCH_31) && + !dar_in_hw_range(regs->dar, info)) + return false; + + return dawrx_constraints; + } + + if (ea_user_range_overlaps(ea, size, info)) + return dawrx_constraints; + + if (ea_hw_range_overlaps(ea, size, info)) { + if (dawrx_constraints) { + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + return true; + } + } + return false; +} + +static int cache_op_size(void) +{ +#ifdef __powerpc64__ + return ppc64_caches.l1d.block_size; +#else + return L1_CACHE_BYTES; +#endif +} + +void wp_get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, + int *type, int *size, unsigned long *ea) +{ + struct instruction_op op; + + if (__get_user_instr_inatomic(*instr, (void __user *)regs->nip)) + return; + + analyse_instr(&op, regs, *instr); + *type = GETTYPE(op.type); + *ea = op.ea; +#ifdef __powerpc64__ + if (!(regs->msr & MSR_64BIT)) + *ea &= 0xffffffffUL; +#endif + + *size = GETSIZE(op.type); + if (*type == CACHEOP) { + *size = cache_op_size(); + *ea &= ~(*size - 1); + } else if (*type == LOAD_VMX || *type == STORE_VMX) { + *ea &= ~(*size - 1); + } +} diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 422e31d2f5a2..ae0e2632393d 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -41,14 +41,6 @@ static int __init powersave_off(char *arg) } __setup("powersave=off", powersave_off); -#ifdef CONFIG_HOTPLUG_CPU -void arch_cpu_idle_dead(void) -{ - sched_preempt_enable_no_resched(); - cpu_die(); -} -#endif - void arch_cpu_idle(void) { ppc64_runlatch_off(); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 9704f3f76e63..5b69a6a72a0e 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -172,7 +172,6 @@ static unsigned long iommu_range_alloc(struct device *dev, int largealloc = npages > 15; int pass = 0; unsigned long align_mask; - unsigned long boundary_size; unsigned long flags; unsigned int pool_nr; struct iommu_pool *pool; @@ -236,15 +235,9 @@ again: } } - if (dev) - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - 1 << tbl->it_page_shift); - else - boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift); - /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */ - n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, - boundary_size >> tbl->it_page_shift, align_mask); + dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift), + align_mask); if (n == -1) { if (likely(pass == 0)) { /* First try the pool from the start */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index bf21ebd36190..7d0f7682d01d 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,7 +104,7 @@ static inline notrace unsigned long get_irq_happened(void) static inline notrace int decrementer_check_overflow(void) { - u64 now = get_tb_or_rtc(); + u64 now = get_tb(); u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); return now >= *next_tb; @@ -113,7 +113,7 @@ static inline notrace int decrementer_check_overflow(void) #ifdef CONFIG_PPC_BOOK3E /* This is called whenever we are re-enabling interrupts - * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if + * and returns either 0 (nothing to do) or 500/900/280 if * there's an EE, DEC or DBELL to generate. * * This is called in two contexts: From arch_local_irq_restore() @@ -181,16 +181,6 @@ notrace unsigned int __check_irq_replay(void) return 0x500; } - /* - * Check if an EPR external interrupt happened this bit is typically - * set if we need to handle another "edge" interrupt from within the - * MPIC "EPR" handler. - */ - if (happened & PACA_IRQ_EE_EDGE) { - local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; - return 0x500; - } - if (happened & PACA_IRQ_DBELL) { local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0x280; @@ -201,6 +191,25 @@ notrace unsigned int __check_irq_replay(void) return 0; } + +/* + * This is specifically called by assembly code to re-enable interrupts + * if they are currently disabled. This is typically called before + * schedule() or do_signal() when returning to userspace. We do it + * in C to avoid the burden of dealing with lockdep etc... + * + * NOTE: This is called with interrupts hard disabled but not marked + * as such in paca->irq_happened, so we need to resync this. + */ +void notrace restore_interrupts(void) +{ + if (irqs_disabled()) { + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + local_irq_enable(); + } else + __hard_irq_enable(); +} + #endif /* CONFIG_PPC_BOOK3E */ void replay_soft_interrupts(void) @@ -214,7 +223,7 @@ void replay_soft_interrupts(void) struct pt_regs regs; ppc_save_regs(®s); - regs.softe = IRQS_ALL_DISABLED; + regs.softe = IRQS_ENABLED; again: if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) @@ -270,19 +279,6 @@ again: hard_irq_disable(); } - /* - * Check if an EPR external interrupt happened this bit is typically - * set if we need to handle another "edge" interrupt from within the - * MPIC "EPR" handler. - */ - if (IS_ENABLED(CONFIG_PPC_BOOK3E) && (happened & PACA_IRQ_EE_EDGE)) { - local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; - regs.trap = 0x500; - do_IRQ(®s); - if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) - hard_irq_disable(); - } - if (IS_ENABLED(CONFIG_PPC_DOORBELL) && (happened & PACA_IRQ_DBELL)) { local_paca->irq_happened &= ~PACA_IRQ_DBELL; if (IS_ENABLED(CONFIG_PPC_BOOK3E)) @@ -368,6 +364,12 @@ notrace void arch_local_irq_restore(unsigned long mask) } } + /* + * Disable preempt here, so that the below preempt_enable will + * perform resched if required (a replayed interrupt may set + * need_resched). + */ + preempt_disable(); irq_soft_mask_set(IRQS_ALL_DISABLED); trace_hardirqs_off(); @@ -377,28 +379,11 @@ notrace void arch_local_irq_restore(unsigned long mask) trace_hardirqs_on(); irq_soft_mask_set(IRQS_ENABLED); __hard_irq_enable(); + preempt_enable(); } EXPORT_SYMBOL(arch_local_irq_restore); /* - * This is specifically called by assembly code to re-enable interrupts - * if they are currently disabled. This is typically called before - * schedule() or do_signal() when returning to userspace. We do it - * in C to avoid the burden of dealing with lockdep etc... - * - * NOTE: This is called with interrupts hard disabled but not marked - * as such in paca->irq_happened, so we need to resync this. - */ -void notrace restore_interrupts(void) -{ - if (irqs_disabled()) { - local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - local_irq_enable(); - } else - __hard_irq_enable(); -} - -/* * This is a helper to use when about to go into idle low-power * when the latter has the side effect of re-enabling interrupts * (such as calling H_CEDE under pHyp). diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S index 5f07aa5e9851..225511d73bef 100644 --- a/arch/powerpc/kernel/l2cr_6xx.S +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -256,7 +256,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450) sync /* Restore MSR (restores EE and DR bits to original state) */ - SYNC mtmsr r7 isync @@ -377,7 +376,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR) 1: bdnz 1b /* Restore MSR (restores EE and DR bits to original state) */ -4: SYNC +4: mtmsr r7 isync blr diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index b24f866fef81..717e658b90fd 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -215,19 +215,6 @@ _GLOBAL(low_choose_7447a_dfs) #endif /* CONFIG_CPU_FREQ_PMAC && CONFIG_PPC_BOOK3S_32 */ -/* - * complement mask on the msr then "or" some values on. - * _nmask_and_or_msr(nmask, value_to_or) - */ -_GLOBAL(_nmask_and_or_msr) - mfmsr r0 /* Get current msr */ - andc r0,r0,r3 /* And off the bits set in r3 (first parm) */ - or r0,r0,r4 /* Or on the bits in r4 (second parm) */ - SYNC /* Some chip revs have problems here... */ - mtmsr r0 /* Update machine state */ - isync - blr /* Done */ - #ifdef CONFIG_40x /* @@ -268,41 +255,6 @@ _ASM_NOKPROBE_SYMBOL(real_writeb) #endif /* CONFIG_40x */ - -/* - * Flush instruction cache. - * This is a no-op on the 601. - */ -#ifndef CONFIG_PPC_8xx -_GLOBAL(flush_instruction_cache) -#if defined(CONFIG_4xx) - lis r3, KERNELBASE@h - iccci 0,r3 -#elif defined(CONFIG_FSL_BOOKE) -#ifdef CONFIG_E200 - mfspr r3,SPRN_L1CSR0 - ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC - /* msync; isync recommended here */ - mtspr SPRN_L1CSR0,r3 - isync - blr -#endif - mfspr r3,SPRN_L1CSR1 - ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR - mtspr SPRN_L1CSR1,r3 -#elif defined(CONFIG_PPC_BOOK3S_601) - blr /* for 601, do nothing */ -#else - /* 603/604 processor - use invalidate-all bit in HID0 */ - mfspr r3,SPRN_HID0 - ori r3,r3,HID0_ICFI - mtspr SPRN_HID0,r3 -#endif /* CONFIG_4xx */ - isync - blr -EXPORT_SYMBOL(flush_instruction_cache) -#endif /* CONFIG_PPC_8xx */ - /* * Copy a whole page. We use the dcbz instruction on the destination * to reduce memory traffic (it eliminates the unnecessary reads of diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 7bb46ad98207..070465825c21 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -365,7 +365,6 @@ _GLOBAL(kexec_smp_wait) li r4,KEXEC_STATE_REAL_MODE stb r4,PACAKEXECSTATE(r13) - SYNC b kexec_wait diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 73a57043ee66..d421a2c7f822 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -124,10 +124,8 @@ unsigned long notrace msr_check_and_set(unsigned long bits) newmsr = oldmsr | bits; -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP)) newmsr |= MSR_VSX; -#endif if (oldmsr != newmsr) mtmsr_isync(newmsr); @@ -144,10 +142,8 @@ void notrace __msr_check_and_clear(unsigned long bits) newmsr = oldmsr & ~bits; -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP)) newmsr &= ~MSR_VSX; -#endif if (oldmsr != newmsr) mtmsr_isync(newmsr); @@ -162,10 +158,8 @@ static void __giveup_fpu(struct task_struct *tsk) save_fpu(tsk); msr = tsk->thread.regs->msr; msr &= ~(MSR_FP|MSR_FE0|MSR_FE1); -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr &= ~MSR_VSX; -#endif tsk->thread.regs->msr = msr; } @@ -235,6 +229,8 @@ void enable_kernel_fp(void) } } EXPORT_SYMBOL(enable_kernel_fp); +#else +static inline void __giveup_fpu(struct task_struct *tsk) { } #endif /* CONFIG_PPC_FPU */ #ifdef CONFIG_ALTIVEC @@ -245,10 +241,8 @@ static void __giveup_altivec(struct task_struct *tsk) save_altivec(tsk); msr = tsk->thread.regs->msr; msr &= ~MSR_VEC; -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr &= ~MSR_VSX; -#endif tsk->thread.regs->msr = msr; } @@ -414,21 +408,14 @@ static unsigned long msr_all_available; static int __init init_msr_all_available(void) { -#ifdef CONFIG_PPC_FPU - msr_all_available |= MSR_FP; -#endif -#ifdef CONFIG_ALTIVEC + if (IS_ENABLED(CONFIG_PPC_FPU)) + msr_all_available |= MSR_FP; if (cpu_has_feature(CPU_FTR_ALTIVEC)) msr_all_available |= MSR_VEC; -#endif -#ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr_all_available |= MSR_VSX; -#endif -#ifdef CONFIG_SPE if (cpu_has_feature(CPU_FTR_SPE)) msr_all_available |= MSR_SPE; -#endif return 0; } @@ -452,18 +439,12 @@ void giveup_all(struct task_struct *tsk) WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); -#ifdef CONFIG_PPC_FPU if (usermsr & MSR_FP) __giveup_fpu(tsk); -#endif -#ifdef CONFIG_ALTIVEC if (usermsr & MSR_VEC) __giveup_altivec(tsk); -#endif -#ifdef CONFIG_SPE if (usermsr & MSR_SPE) __giveup_spe(tsk); -#endif msr_check_and_clear(msr_all_available); } @@ -509,19 +490,18 @@ static bool should_restore_altivec(void) { return false; } static void do_restore_altivec(void) { } #endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_VSX static bool should_restore_vsx(void) { if (cpu_has_feature(CPU_FTR_VSX)) return true; return false; } +#ifdef CONFIG_VSX static void do_restore_vsx(void) { current->thread.used_vsr = 1; } #else -static bool should_restore_vsx(void) { return false; } static void do_restore_vsx(void) { } #endif /* CONFIG_VSX */ @@ -581,7 +561,7 @@ void notrace restore_math(struct pt_regs *regs) regs->msr |= new_msr | fpexc_mode; } } -#endif +#endif /* CONFIG_PPC_BOOK3S_64 */ static void save_all(struct task_struct *tsk) { @@ -642,6 +622,44 @@ void do_send_trap(struct pt_regs *regs, unsigned long address, (void __user *)address); } #else /* !CONFIG_PPC_ADV_DEBUG_REGS */ + +static void do_break_handler(struct pt_regs *regs) +{ + struct arch_hw_breakpoint null_brk = {0}; + struct arch_hw_breakpoint *info; + struct ppc_inst instr = ppc_inst(0); + int type = 0; + int size = 0; + unsigned long ea; + int i; + + /* + * If underneath hw supports only one watchpoint, we know it + * caused exception. 8xx also falls into this category. + */ + if (nr_wp_slots() == 1) { + __set_breakpoint(0, &null_brk); + current->thread.hw_brk[0] = null_brk; + current->thread.hw_brk[0].flags |= HW_BRK_FLAG_DISABLED; + return; + } + + /* Otherwise findout which DAWR caused exception and disable it. */ + wp_get_instr_detail(regs, &instr, &type, &size, &ea); + + for (i = 0; i < nr_wp_slots(); i++) { + info = ¤t->thread.hw_brk[i]; + if (!info->address) + continue; + + if (wp_check_constraints(regs, instr, ea, type, size, info)) { + __set_breakpoint(i, &null_brk); + current->thread.hw_brk[i] = null_brk; + current->thread.hw_brk[i].flags |= HW_BRK_FLAG_DISABLED; + } + } +} + void do_break (struct pt_regs *regs, unsigned long address, unsigned long error_code) { @@ -653,6 +671,16 @@ void do_break (struct pt_regs *regs, unsigned long address, if (debugger_break_match(regs)) return; + /* + * We reach here only when watchpoint exception is generated by ptrace + * event (or hw is buggy!). Now if CONFIG_HAVE_HW_BREAKPOINT is set, + * watchpoint is already handled by hw_breakpoint_handler() so we don't + * have to do anything. But when CONFIG_HAVE_HW_BREAKPOINT is not set, + * we need to manually handle the watchpoint here. + */ + if (!IS_ENABLED(CONFIG_HAVE_HW_BREAKPOINT)) + do_break_handler(regs); + /* Deliver the signal to userspace */ force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address); } @@ -783,9 +811,8 @@ static void switch_hw_breakpoint(struct task_struct *new) static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) { mtspr(SPRN_DAC1, dabr); -#ifdef CONFIG_PPC_47x - isync(); -#endif + if (IS_ENABLED(CONFIG_PPC_47x)) + isync(); return 0; } #elif defined(CONFIG_PPC_BOOK3S) @@ -1256,15 +1283,17 @@ struct task_struct *__switch_to(struct task_struct *prev, restore_math(current->thread.regs); /* - * The copy-paste buffer can only store into foreign real - * addresses, so unprivileged processes can not see the - * data or use it in any way unless they have foreign real - * mappings. If the new process has the foreign real address - * mappings, we must issue a cp_abort to clear any state and - * prevent snooping, corruption or a covert channel. + * On POWER9 the copy-paste buffer can only paste into + * foreign real addresses, so unprivileged processes can not + * see the data or use it in any way unless they have + * foreign real mappings. If the new process has the foreign + * real address mappings, we must issue a cp_abort to clear + * any state and prevent snooping, corruption or a covert + * channel. ISA v3.1 supports paste into local memory. */ if (current->mm && - atomic_read(¤t->mm->context.vas_windows)) + (cpu_has_feature(CPU_FTR_ARCH_31) || + atomic_read(¤t->mm->context.vas_windows))) asm volatile(PPC_CP_ABORT); } #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -1453,12 +1482,13 @@ void show_regs(struct pt_regs * regs) trap = TRAP(regs); if (!trap_is_syscall(regs) && cpu_has_feature(CPU_FTR_CFAR)) pr_cont("CFAR: "REG" ", regs->orig_gpr3); - if (trap == 0x200 || trap == 0x300 || trap == 0x600) -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); -#else - pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); -#endif + if (trap == 0x200 || trap == 0x300 || trap == 0x600) { + if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + else + pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); + } + #ifdef CONFIG_PPC64 pr_cont("IRQMASK: %lx ", regs->softe); #endif @@ -1475,14 +1505,14 @@ void show_regs(struct pt_regs * regs) break; } pr_cont("\n"); -#ifdef CONFIG_KALLSYMS /* * Lookup NIP late so we have the best change of getting the * above info out without failing */ - printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); - printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); -#endif + if (IS_ENABLED(CONFIG_KALLSYMS)) { + printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); + printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); + } show_stack(current, (unsigned long *) regs->gpr[1], KERN_DEFAULT); if (!user_mode(regs)) show_instructions(regs); @@ -1731,11 +1761,9 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ -#ifdef CONFIG_PPC_BOOK3S_64 - if (!radix_enabled()) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled()) preload_new_slb_context(start, sp); #endif -#endif /* * If we exec out of a kernel thread then thread.regs will not be @@ -1866,7 +1894,6 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) * fpexc_mode. fpexc_mode is also used for setting FP exception * mode (asyn, precise, disabled) for 'Classic' FP. */ if (val & PR_FP_EXC_SW_ENABLE) { -#ifdef CONFIG_SPE if (cpu_has_feature(CPU_FTR_SPE)) { /* * When the sticky exception bits are set @@ -1880,16 +1907,15 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) * anyway to restore the prctl settings from * the saved environment. */ +#ifdef CONFIG_SPE tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); tsk->thread.fpexc_mode = val & (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); +#endif return 0; } else { return -EINVAL; } -#else - return -EINVAL; -#endif } /* on a CONFIG_SPE this does not hurt us. The bits that @@ -1908,10 +1934,9 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) { - unsigned int val; + unsigned int val = 0; - if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) -#ifdef CONFIG_SPE + if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) { if (cpu_has_feature(CPU_FTR_SPE)) { /* * When the sticky exception bits are set @@ -1925,15 +1950,15 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) * anyway to restore the prctl settings from * the saved environment. */ +#ifdef CONFIG_SPE tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); val = tsk->thread.fpexc_mode; +#endif } else return -EINVAL; -#else - return -EINVAL; -#endif - else + } else { val = __unpack_fe01(tsk->thread.fpexc_mode); + } return put_user(val, (unsigned int __user *) adr); } @@ -2102,10 +2127,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack, unsigned long sp, ip, lr, newsp; int count = 0; int firstframe = 1; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned long ret_addr; int ftrace_idx = 0; -#endif if (tsk == NULL) tsk = current; @@ -2133,12 +2156,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack, if (!firstframe || ip != lr) { printk("%s["REG"] ["REG"] %pS", loglvl, sp, ip, (void *)ip); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER ret_addr = ftrace_graph_ret_addr(current, &ftrace_idx, ip, stack); if (ret_addr != ip) pr_cont(" (%pS)", (void *)ret_addr); -#endif if (firstframe) pr_cont(" (unreliable)"); pr_cont("\n"); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index d8a2fb87ba0c..c1545f22c077 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -776,6 +776,11 @@ void __init early_init_devtree(void *params) limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); memblock_enforce_memory_limit(limit); +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_4K_PAGES) + if (!early_radix_enabled()) + memblock_cap_memory_range(0, 1UL << (H_MAX_PHYSMEM_BITS)); +#endif + memblock_allow_resize(); memblock_dump_all(); diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ae7ec9903191..5090a5ab54e5 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2422,10 +2422,19 @@ static void __init prom_check_displays(void) u32 width, height, pitch, addr; prom_printf("Setting btext !\n"); - prom_getprop(node, "width", &width, 4); - prom_getprop(node, "height", &height, 4); - prom_getprop(node, "linebytes", &pitch, 4); - prom_getprop(node, "address", &addr, 4); + + if (prom_getprop(node, "width", &width, 4) == PROM_ERROR) + return; + + if (prom_getprop(node, "height", &height, 4) == PROM_ERROR) + return; + + if (prom_getprop(node, "linebytes", &pitch, 4) == PROM_ERROR) + return; + + if (prom_getprop(node, "address", &addr, 4) == PROM_ERROR) + return; + prom_printf("W=%d H=%d LB=%d addr=0x%x\n", width, height, pitch, addr); btext_setup_display(width, height, 8, pitch, addr); diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c b/arch/powerpc/kernel/ptrace/ptrace-noadv.c index 697c7e4b5877..aa36fcad36cd 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c +++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c @@ -57,6 +57,8 @@ void ppc_gethwdinfo(struct ppc_debug_info *dbginfo) } else { dbginfo->features = 0; } + if (cpu_has_feature(CPU_FTR_ARCH_31)) + dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_ARCH_31; } int ptrace_get_debugreg(struct task_struct *child, unsigned long addr, @@ -217,8 +219,9 @@ long ppc_set_hwdebug(struct task_struct *child, struct ppc_hw_breakpoint *bp_inf return -EIO; brk.address = ALIGN_DOWN(bp_info->addr, HW_BREAKPOINT_SIZE); - brk.type = HW_BRK_TYPE_TRANSLATE; + brk.type = HW_BRK_TYPE_TRANSLATE | HW_BRK_TYPE_PRIV_ALL; brk.len = DABR_MAX_LEN; + brk.hw_len = DABR_MAX_LEN; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) brk.type |= HW_BRK_TYPE_READ; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) @@ -286,11 +289,13 @@ long ppc_del_hwdebug(struct task_struct *child, long data) } return ret; #else /* CONFIG_HAVE_HW_BREAKPOINT */ - if (child->thread.hw_brk[data - 1].address == 0) + if (!(child->thread.hw_brk[data - 1].flags & HW_BRK_FLAG_DISABLED) && + child->thread.hw_brk[data - 1].address == 0) return -ENOENT; child->thread.hw_brk[data - 1].address = 0; child->thread.hw_brk[data - 1].type = 0; + child->thread.hw_brk[data - 1].flags = 0; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ return 0; diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 806d554ce357..954f41676f69 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -992,6 +992,147 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, return NULL; } +#ifdef CONFIG_PPC_RTAS_FILTER + +/* + * The sys_rtas syscall, as originally designed, allows root to pass + * arbitrary physical addresses to RTAS calls. A number of RTAS calls + * can be abused to write to arbitrary memory and do other things that + * are potentially harmful to system integrity, and thus should only + * be used inside the kernel and not exposed to userspace. + * + * All known legitimate users of the sys_rtas syscall will only ever + * pass addresses that fall within the RMO buffer, and use a known + * subset of RTAS calls. + * + * Accordingly, we filter RTAS requests to check that the call is + * permitted, and that provided pointers fall within the RMO buffer. + * The rtas_filters list contains an entry for each permitted call, + * with the indexes of the parameters which are expected to contain + * addresses and sizes of buffers allocated inside the RMO buffer. + */ +struct rtas_filter { + const char *name; + int token; + /* Indexes into the args buffer, -1 if not used */ + int buf_idx1; + int size_idx1; + int buf_idx2; + int size_idx2; + + int fixed_size; +}; + +static struct rtas_filter rtas_filters[] __ro_after_init = { + { "ibm,activate-firmware", -1, -1, -1, -1, -1 }, + { "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 }, /* Special cased */ + { "display-character", -1, -1, -1, -1, -1 }, + { "ibm,display-message", -1, 0, -1, -1, -1 }, + { "ibm,errinjct", -1, 2, -1, -1, -1, 1024 }, + { "ibm,close-errinjct", -1, -1, -1, -1, -1 }, + { "ibm,open-errinct", -1, -1, -1, -1, -1 }, + { "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 }, + { "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 }, + { "ibm,get-indices", -1, 2, 3, -1, -1 }, + { "get-power-level", -1, -1, -1, -1, -1 }, + { "get-sensor-state", -1, -1, -1, -1, -1 }, + { "ibm,get-system-parameter", -1, 1, 2, -1, -1 }, + { "get-time-of-day", -1, -1, -1, -1, -1 }, + { "ibm,get-vpd", -1, 0, -1, 1, 2 }, + { "ibm,lpar-perftools", -1, 2, 3, -1, -1 }, + { "ibm,platform-dump", -1, 4, 5, -1, -1 }, + { "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 }, + { "ibm,scan-log-dump", -1, 0, 1, -1, -1 }, + { "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 }, + { "ibm,set-eeh-option", -1, -1, -1, -1, -1 }, + { "set-indicator", -1, -1, -1, -1, -1 }, + { "set-power-level", -1, -1, -1, -1, -1 }, + { "set-time-for-power-on", -1, -1, -1, -1, -1 }, + { "ibm,set-system-parameter", -1, 1, -1, -1, -1 }, + { "set-time-of-day", -1, -1, -1, -1, -1 }, + { "ibm,suspend-me", -1, -1, -1, -1, -1 }, + { "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 }, + { "ibm,update-properties", -1, 0, -1, -1, -1, 4096 }, + { "ibm,physical-attestation", -1, 0, 1, -1, -1 }, +}; + +static bool in_rmo_buf(u32 base, u32 end) +{ + return base >= rtas_rmo_buf && + base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) && + base <= end && + end >= rtas_rmo_buf && + end < (rtas_rmo_buf + RTAS_RMOBUF_MAX); +} + +static bool block_rtas_call(int token, int nargs, + struct rtas_args *args) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) { + struct rtas_filter *f = &rtas_filters[i]; + u32 base, size, end; + + if (token != f->token) + continue; + + if (f->buf_idx1 != -1) { + base = be32_to_cpu(args->args[f->buf_idx1]); + if (f->size_idx1 != -1) + size = be32_to_cpu(args->args[f->size_idx1]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; + + end = base + size - 1; + if (!in_rmo_buf(base, end)) + goto err; + } + + if (f->buf_idx2 != -1) { + base = be32_to_cpu(args->args[f->buf_idx2]); + if (f->size_idx2 != -1) + size = be32_to_cpu(args->args[f->size_idx2]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; + end = base + size - 1; + + /* + * Special case for ibm,configure-connector where the + * address can be 0 + */ + if (!strcmp(f->name, "ibm,configure-connector") && + base == 0) + return false; + + if (!in_rmo_buf(base, end)) + goto err; + } + + return false; + } + +err: + pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n"); + pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n", + token, nargs, current->comm); + return true; +} + +#else + +static bool block_rtas_call(int token, int nargs, + struct rtas_args *args) +{ + return false; +} + +#endif /* CONFIG_PPC_RTAS_FILTER */ + /* We assume to be passed big endian arguments */ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) { @@ -1029,6 +1170,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) args.rets = &args.args[nargs]; memset(args.rets, 0, nret * sizeof(rtas_arg_t)); + if (block_rtas_call(token, nargs, &args)) + return -EINVAL; + /* Need to handle ibm,suspend_me call specially */ if (token == ibm_suspend_me_token) { @@ -1090,6 +1234,9 @@ void __init rtas_initialize(void) unsigned long rtas_region = RTAS_INSTANTIATE_MAX; u32 base, size, entry; int no_base, no_size, no_entry; +#ifdef CONFIG_PPC_RTAS_FILTER + int i; +#endif /* Get RTAS dev node and fill up our "rtas" structure with infos * about it. @@ -1129,6 +1276,12 @@ void __init rtas_initialize(void) #ifdef CONFIG_RTAS_ERROR_LOGGING rtas_last_error_token = rtas_token("rtas-last-error"); #endif + +#ifdef CONFIG_PPC_RTAS_FILTER + for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) { + rtas_filters[i].token = rtas_token(rtas_filters[i].name); + } +#endif } int __init early_init_dt_scan_rtas(unsigned long node, diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index c9876aab3142..e4e1a94ccf6a 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -430,30 +430,44 @@ device_initcall(stf_barrier_debugfs_init); static void update_branch_cache_flush(void) { + u32 *site; + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + site = &patch__call_kvm_flush_link_stack; // This controls the branch from guest_exit_cont to kvm_flush_link_stack if (link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) { - patch_instruction_site(&patch__call_kvm_flush_link_stack, - ppc_inst(PPC_INST_NOP)); + patch_instruction_site(site, ppc_inst(PPC_INST_NOP)); } else { // Could use HW flush, but that could also flush count cache - patch_branch_site(&patch__call_kvm_flush_link_stack, - (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); + patch_branch_site(site, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); } #endif + // Patch out the bcctr first, then nop the rest + site = &patch__call_flush_branch_caches3; + patch_instruction_site(site, ppc_inst(PPC_INST_NOP)); + site = &patch__call_flush_branch_caches2; + patch_instruction_site(site, ppc_inst(PPC_INST_NOP)); + site = &patch__call_flush_branch_caches1; + patch_instruction_site(site, ppc_inst(PPC_INST_NOP)); + // This controls the branch from _switch to flush_branch_caches if (count_cache_flush_type == BRANCH_CACHE_FLUSH_NONE && link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) { - patch_instruction_site(&patch__call_flush_branch_caches, - ppc_inst(PPC_INST_NOP)); + // Nothing to be done + } else if (count_cache_flush_type == BRANCH_CACHE_FLUSH_HW && link_stack_flush_type == BRANCH_CACHE_FLUSH_HW) { - patch_instruction_site(&patch__call_flush_branch_caches, - ppc_inst(PPC_INST_BCCTR_FLUSH)); + // Patch in the bcctr last + site = &patch__call_flush_branch_caches1; + patch_instruction_site(site, ppc_inst(0x39207fff)); // li r9,0x7fff + site = &patch__call_flush_branch_caches2; + patch_instruction_site(site, ppc_inst(0x7d2903a6)); // mtctr r9 + site = &patch__call_flush_branch_caches3; + patch_instruction_site(site, ppc_inst(PPC_INST_BCCTR_FLUSH)); + } else { - patch_branch_site(&patch__call_flush_branch_caches, - (u64)&flush_branch_caches, BRANCH_SET_LINK); + patch_branch_site(site, (u64)&flush_branch_caches, BRANCH_SET_LINK); // If we just need to flush the link stack, early return if (count_cache_flush_type == BRANCH_CACHE_FLUSH_NONE) { diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 1823706ae076..057d6b8e9bb0 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -223,6 +223,6 @@ __init void initialize_cache_info(void) dcache_bsize = cur_cpu_spec->dcache_bsize; icache_bsize = cur_cpu_spec->icache_bsize; ucache_bsize = 0; - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200)) + if (IS_ENABLED(CONFIG_E200)) ucache_bsize = icache_bsize = dcache_bsize; } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6be430107c6f..bb9cab3641d7 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -66,6 +66,7 @@ #include <asm/feature-fixups.h> #include <asm/kup.h> #include <asm/early_ioremap.h> +#include <asm/pgalloc.h> #include "setup.h" @@ -756,17 +757,46 @@ void __init emergency_stack_init(void) } #ifdef CONFIG_SMP -#define PCPU_DYN_SIZE () - -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, + size_t align) { - return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_ACCESSIBLE, - early_cpu_to_node(cpu)); + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + if (!node_online(node) || !NODE_DATA(node)) { + ptr = memblock_alloc_from(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = memblock_alloc_try_nid(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, node); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at " + "%016lx\n", cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return memblock_alloc_from(size, align, goal); +#endif } -static void __init pcpu_fc_free(void *ptr, size_t size) +static void __init pcpu_free_bootmem(void *ptr, size_t size) { memblock_free(__pa(ptr), size); } @@ -782,13 +812,58 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static void __init pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + if (!new) + goto err_alloc; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + if (!new) + goto err_alloc; + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + if (!new) + goto err_alloc; + pmd_populate_kernel(&init_mm, pmd, new); + } + + return; + +err_alloc: + panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); +} + + void __init setup_per_cpu_areas(void) { const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t atom_size; unsigned long delta; unsigned int cpu; - int rc; + int rc = -EINVAL; /* * Linear mapping is one of 4K, 1M and 16M. For 4K, no need @@ -800,8 +875,18 @@ void __init setup_per_cpu_areas(void) else atom_size = 1 << 20; - rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_alloc_bootmem, pcpu_free_bootmem); + if (rc) + pr_warn("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } + + if (rc < 0) + rc = pcpu_page_first_chunk(0, pcpu_alloc_bootmem, pcpu_free_bootmem, + pcpu_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index d15a98c758b8..d2c356f37077 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -312,9 +312,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) { user_exit(); - /* Check valid addr_limit, TIF check is done there */ - addr_limit_user_check(); - if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); @@ -327,7 +324,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) } if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8261999c7d52..0dc1b8591cc8 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -75,17 +75,28 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; struct task_struct *secondary_current; bool has_big_cores; +bool coregroup_enabled; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); EXPORT_SYMBOL_GPL(has_big_cores); +enum { +#ifdef CONFIG_SCHED_SMT + smt_idx, +#endif + cache_idx, + mc_idx, + die_idx, +}; + #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 struct thread_groups { @@ -660,6 +671,28 @@ static void set_cpus_unrelated(int i, int j, #endif /* + * Extends set_cpus_related. Instead of setting one CPU at a time in + * dstmask, set srcmask at oneshot. dstmask should be super set of srcmask. + */ +static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int), + struct cpumask *(*dstmask)(int)) +{ + struct cpumask *mask; + int k; + + mask = srcmask(j); + for_each_cpu(k, srcmask(i)) + cpumask_or(dstmask(k), dstmask(k), mask); + + if (i == j) + return; + + mask = srcmask(i); + for_each_cpu(k, srcmask(j)) + cpumask_or(dstmask(k), dstmask(k), mask); +} + +/* * parse_thread_groups: Parses the "ibm,thread-groups" device tree * property for the CPU device node @dn and stores * the parsed output in the thread_groups @@ -789,10 +822,6 @@ static int init_cpu_l1_cache_map(int cpu) if (err) goto out; - zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu), - GFP_KERNEL, - cpu_to_node(cpu)); - cpu_group_start = get_cpu_thread_group_start(cpu, &tg); if (unlikely(cpu_group_start == -1)) { @@ -801,6 +830,9 @@ static int init_cpu_l1_cache_map(int cpu) goto out; } + zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + for (i = first_thread; i < first_thread + threads_per_core; i++) { int i_group_start = get_cpu_thread_group_start(i, &tg); @@ -819,6 +851,74 @@ out: return err; } +static bool shared_caches; + +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymmetric SMT dependency */ +static int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +/* + * P9 has a slightly odd architecture where pairs of cores share an L2 cache. + * This topology makes it *much* cheaper to migrate tasks between adjacent cores + * since the migrated task remains cache hot. We want to take advantage of this + * at the scheduler level so an extra topology level is required. + */ +static int powerpc_shared_cache_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} + +/* + * We can't just pass cpu_l2_cache_mask() directly because + * returns a non-const pointer and the compiler barfs on that. + */ +static const struct cpumask *shared_cache_mask(int cpu) +{ + return per_cpu(cpu_l2_cache_map, cpu); +} + +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *smallcore_smt_mask(int cpu) +{ + return cpu_smallcore_mask(cpu); +} +#endif + +static struct cpumask *cpu_coregroup_mask(int cpu) +{ + return per_cpu(cpu_coregroup_map, cpu); +} + +static bool has_coregroup_support(void) +{ + return coregroup_enabled; +} + +static const struct cpumask *cpu_mc_mask(int cpu) +{ + return cpu_coregroup_mask(cpu); +} + +static struct sched_domain_topology_level powerpc_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, + { cpu_mc_mask, SD_INIT_NAME(MC) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static int init_big_cores(void) { int cpu; @@ -861,6 +961,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + if (has_coregroup_support()) + zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + +#ifdef CONFIG_NEED_MULTIPLE_NODES /* * numa_node_id() works after this. */ @@ -869,12 +974,21 @@ void __init smp_prepare_cpus(unsigned int max_cpus) set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu])); } +#endif + /* + * cpu_core_map is now more updated and exists only since + * its been exported for long. It only will have a snapshot + * of cpu_cpu_mask. + */ + cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu)); } /* Init the cpumasks so the boot CPU is related to itself */ cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); - cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); + + if (has_coregroup_support()) + cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid)); init_big_cores(); if (has_big_cores) { @@ -1126,30 +1240,61 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } -static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) +static bool update_mask_by_l2(int cpu) { + struct cpumask *(*submask_fn)(int) = cpu_sibling_mask; struct device_node *l2_cache, *np; + cpumask_var_t mask; int i; l2_cache = cpu_to_l2cache(cpu); - if (!l2_cache) + if (!l2_cache) { + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; + + /* + * If no l2cache for this CPU, assume all siblings to share + * cache with this CPU. + */ + if (has_big_cores) + sibling_mask = cpu_smallcore_mask; + + for_each_cpu(i, sibling_mask(cpu)) + set_cpus_related(cpu, i, cpu_l2_cache_mask); + return false; + } + + alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu)); + cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu)); - for_each_cpu(i, cpu_online_mask) { + if (has_big_cores) + submask_fn = cpu_smallcore_mask; + + /* Update l2-cache mask with all the CPUs that are part of submask */ + or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); + + /* Skip all CPUs already part of current CPU l2-cache mask */ + cpumask_andnot(mask, mask, cpu_l2_cache_mask(cpu)); + + for_each_cpu(i, mask) { /* * when updating the marks the current CPU has not been marked * online, but we need to update the cache masks */ np = cpu_to_l2cache(i); - if (!np) - continue; - if (np == l2_cache) - set_cpus_related(cpu, i, mask_fn); + /* Skip all CPUs already part of current CPU l2-cache */ + if (np == l2_cache) { + or_cpumasks_related(cpu, i, submask_fn, cpu_l2_cache_mask); + cpumask_andnot(mask, mask, submask_fn(i)); + } else { + cpumask_andnot(mask, mask, cpu_l2_cache_mask(i)); + } of_node_put(np); } of_node_put(l2_cache); + free_cpumask_var(mask); return true; } @@ -1157,59 +1302,75 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) #ifdef CONFIG_HOTPLUG_CPU static void remove_cpu_from_masks(int cpu) { + struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; - /* NB: cpu_core_mask is a superset of the others */ - for_each_cpu(i, cpu_core_mask(cpu)) { - set_cpus_unrelated(cpu, i, cpu_core_mask); + if (shared_caches) + mask_fn = cpu_l2_cache_mask; + + for_each_cpu(i, mask_fn(cpu)) { set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); set_cpus_unrelated(cpu, i, cpu_sibling_mask); if (has_big_cores) set_cpus_unrelated(cpu, i, cpu_smallcore_mask); } + + if (has_coregroup_support()) { + for_each_cpu(i, cpu_coregroup_mask(cpu)) + set_cpus_unrelated(cpu, i, cpu_coregroup_mask); + } } #endif static inline void add_cpu_to_smallcore_masks(int cpu) { - struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu); - int i, first_thread = cpu_first_thread_sibling(cpu); + int i; if (!has_big_cores) return; cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu)); - for (i = first_thread; i < first_thread + threads_per_core; i++) { - if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map)) + for_each_cpu(i, per_cpu(cpu_l1_cache_map, cpu)) { + if (cpu_online(i)) set_cpus_related(i, cpu, cpu_smallcore_mask); } } -int get_physical_package_id(int cpu) +static void update_coregroup_mask(int cpu) { - int pkg_id = cpu_to_chip_id(cpu); + struct cpumask *(*submask_fn)(int) = cpu_sibling_mask; + cpumask_var_t mask; + int coregroup_id = cpu_to_coregroup_id(cpu); + int i; - /* - * If the platform is PowerNV or Guest on KVM, ibm,chip-id is - * defined. Hence we would return the chip-id as the result of - * get_physical_package_id. - */ - if (pkg_id == -1 && firmware_has_feature(FW_FEATURE_LPAR) && - IS_ENABLED(CONFIG_PPC_SPLPAR)) { - struct device_node *np = of_get_cpu_node(cpu, NULL); - pkg_id = of_node_to_nid(np); - of_node_put(np); - } + alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu)); + cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu)); + + if (shared_caches) + submask_fn = cpu_l2_cache_mask; - return pkg_id; + /* Update coregroup mask with all the CPUs that are part of submask */ + or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); + + /* Skip all CPUs already part of coregroup mask */ + cpumask_andnot(mask, mask, cpu_coregroup_mask(cpu)); + + for_each_cpu(i, mask) { + /* Skip all CPUs not part of this coregroup */ + if (coregroup_id == cpu_to_coregroup_id(i)) { + or_cpumasks_related(cpu, i, submask_fn, cpu_coregroup_mask); + cpumask_andnot(mask, mask, submask_fn(i)); + } else { + cpumask_andnot(mask, mask, cpu_coregroup_mask(i)); + } + } + free_cpumask_var(mask); } -EXPORT_SYMBOL_GPL(get_physical_package_id); static void add_cpu_to_masks(int cpu) { int first_thread = cpu_first_thread_sibling(cpu); - int pkg_id = get_physical_package_id(cpu); int i; /* @@ -1223,36 +1384,16 @@ static void add_cpu_to_masks(int cpu) set_cpus_related(i, cpu, cpu_sibling_mask); add_cpu_to_smallcore_masks(cpu); - /* - * Copy the thread sibling mask into the cache sibling mask - * and mark any CPUs that share an L2 with this CPU. - */ - for_each_cpu(i, cpu_sibling_mask(cpu)) - set_cpus_related(cpu, i, cpu_l2_cache_mask); - update_mask_by_l2(cpu, cpu_l2_cache_mask); + update_mask_by_l2(cpu); - /* - * Copy the cache sibling mask into core sibling mask and mark - * any CPUs on the same chip as this CPU. - */ - for_each_cpu(i, cpu_l2_cache_mask(cpu)) - set_cpus_related(cpu, i, cpu_core_mask); - - if (pkg_id == -1) - return; - - for_each_cpu(i, cpu_online_mask) - if (get_physical_package_id(i) == pkg_id) - set_cpus_related(cpu, i, cpu_core_mask); + if (has_coregroup_support()) + update_coregroup_mask(cpu); } -static bool shared_caches; - /* Activate a secondary processor. */ void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); - struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; mmgrab(&init_mm); current->active_mm = &init_mm; @@ -1278,14 +1419,20 @@ void start_secondary(void *unused) /* Update topology CPU masks */ add_cpu_to_masks(cpu); - if (has_big_cores) - sibling_mask = cpu_smallcore_mask; /* * Check for any shared caches. Note that this must be done on a * per-core basis because one core in the pair might be disabled. */ - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu))) - shared_caches = true; + if (!shared_caches) { + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; + struct cpumask *mask = cpu_l2_cache_mask(cpu); + + if (has_big_cores) + sibling_mask = cpu_smallcore_mask; + + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu))) + shared_caches = true; + } set_numa_node(numa_cpu_lookup_table[cpu]); set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); @@ -1311,63 +1458,44 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } -#ifdef CONFIG_SCHED_SMT -/* cpumask of CPUs with asymetric SMT dependancy */ -static int powerpc_smt_flags(void) +static void fixup_topology(void) { - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + int i; - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - flags |= SD_ASYM_PACKING; +#ifdef CONFIG_SCHED_SMT + if (has_big_cores) { + pr_info("Big cores detected but using small core scheduling\n"); + powerpc_topology[smt_idx].mask = smallcore_smt_mask; } - return flags; -} #endif -static struct sched_domain_topology_level powerpc_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; + if (!has_coregroup_support()) + powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask; -/* - * P9 has a slightly odd architecture where pairs of cores share an L2 cache. - * This topology makes it *much* cheaper to migrate tasks between adjacent cores - * since the migrated task remains cache hot. We want to take advantage of this - * at the scheduler level so an extra topology level is required. - */ -static int powerpc_shared_cache_flags(void) -{ - return SD_SHARE_PKG_RESOURCES; -} + /* + * Try to consolidate topology levels here instead of + * allowing scheduler to degenerate. + * - Dont consolidate if masks are different. + * - Dont consolidate if sd_flags exists and are different. + */ + for (i = 1; i <= die_idx; i++) { + if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask) + continue; -/* - * We can't just pass cpu_l2_cache_mask() directly because - * returns a non-const pointer and the compiler barfs on that. - */ -static const struct cpumask *shared_cache_mask(int cpu) -{ - return cpu_l2_cache_mask(cpu); -} + if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags && + powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags) + continue; -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) -{ - return cpu_smallcore_mask(cpu); -} -#endif + if (!powerpc_topology[i - 1].sd_flags) + powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags; -static struct sched_domain_topology_level power9_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, + powerpc_topology[i].mask = powerpc_topology[i + 1].mask; + powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags; +#ifdef CONFIG_SCHED_DEBUG + powerpc_topology[i].name = powerpc_topology[i + 1].name; #endif - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; + } +} void __init smp_cpus_done(unsigned int max_cpus) { @@ -1382,24 +1510,8 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); -#ifdef CONFIG_SCHED_SMT - if (has_big_cores) { - pr_info("Big cores detected but using small core scheduling\n"); - power9_topology[0].mask = smallcore_smt_mask; - powerpc_topology[0].mask = smallcore_smt_mask; - } -#endif - /* - * If any CPU detects that it's sharing a cache with another CPU then - * use the deeper topology that is aware of this sharing. - */ - if (shared_caches) { - pr_info("Using shared cache scheduler topology\n"); - set_sched_topology(power9_topology); - } else { - pr_info("Using standard scheduler topology\n"); - set_sched_topology(powerpc_topology); - } + fixup_topology(); + set_sched_topology(powerpc_topology); } #ifdef CONFIG_HOTPLUG_CPU @@ -1429,16 +1541,18 @@ void __cpu_die(unsigned int cpu) smp_ops->cpu_die(cpu); } -void cpu_die(void) +void arch_cpu_idle_dead(void) { + sched_preempt_enable_no_resched(); + /* * Disable on the down path. This will be re-enabled by * start_secondary() via start_secondary_resume() below */ this_cpu_disable_ftrace(); - if (ppc_md.cpu_die) - ppc_md.cpu_die(); + if (smp_ops->cpu_offline_self) + smp_ops->cpu_offline_self(); /* If we return, we re-enter start_secondary */ start_secondary_resume(); diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 9d7fb4ced290..1275daec7fec 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -529,3 +529,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 46b4ebc33db7..2e08640bb3b4 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -32,29 +32,27 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices); -/* - * SMT snooze delay stuff, 64-bit only for now - */ - #ifdef CONFIG_PPC64 -/* Time in microseconds we delay before sleeping in the idle loop */ -static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 }; +/* + * Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle: + * smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in + * 2014: + * + * "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean + * up the kernel code." + * + * powerpc-utils stopped using it as of 1.3.8. At some point in the future this + * code should be removed. + */ static ssize_t store_smt_snooze_delay(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct cpu *cpu = container_of(dev, struct cpu, dev); - ssize_t ret; - long snooze; - - ret = sscanf(buf, "%ld", &snooze); - if (ret != 1) - return -EINVAL; - - per_cpu(smt_snooze_delay, cpu->dev.id) = snooze; + pr_warn_once("%s (%d) stored to unsupported smt_snooze_delay, which has no effect.\n", + current->comm, current->pid); return count; } @@ -62,9 +60,9 @@ static ssize_t show_smt_snooze_delay(struct device *dev, struct device_attribute *attr, char *buf) { - struct cpu *cpu = container_of(dev, struct cpu, dev); - - return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id)); + pr_warn_once("%s (%d) read from unsupported smt_snooze_delay\n", + current->comm, current->pid); + return sprintf(buf, "100\n"); } static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay, @@ -72,16 +70,10 @@ static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay, static int __init setup_smt_snooze_delay(char *str) { - unsigned int cpu; - long snooze; - if (!cpu_has_feature(CPU_FTR_SMT)) return 1; - snooze = simple_strtol(str, NULL, 10); - for_each_possible_cpu(cpu) - per_cpu(smt_snooze_delay, cpu) = snooze; - + pr_warn("smt-snooze-delay command line option has no effect\n"); return 1; } __setup("smt-snooze-delay=", setup_smt_snooze_delay); @@ -225,14 +217,13 @@ static DEVICE_ATTR(dscr_default, 0600, static void sysfs_create_dscr_default(void) { if (cpu_has_feature(CPU_FTR_DSCR)) { - int err = 0; int cpu; dscr_default = spr_default_dscr; for_each_possible_cpu(cpu) paca_ptrs[cpu]->dscr_default = dscr_default; - err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); + device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); } } #endif /* CONFIG_PPC64 */ @@ -1168,6 +1159,7 @@ static int __init topology_init(void) for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); +#ifdef CONFIG_HOTPLUG_CPU /* * For now, we just see if the system supports making * the RTAS calls for CPU hotplug. But, there may be a @@ -1175,8 +1167,9 @@ static int __init topology_init(void) * CPU. For instance, the boot cpu might never be valid * for hotplugging. */ - if (ppc_md.cpu_die) + if (smp_ops->cpu_offline_self) c->hotpluggable = 1; +#endif if (cpu_online(cpu) || c->hotpluggable) { register_cpu(c, cpu); diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index e2ab8a111b69..0b4694b8d248 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -13,13 +13,14 @@ */ #include <linux/errno.h> -#include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/param.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/delay.h> +#include <linux/workqueue.h> #include <asm/io.h> #include <asm/reg.h> @@ -39,9 +40,7 @@ static struct tau_temp unsigned char grew; } tau[NR_CPUS]; -struct timer_list tau_timer; - -#undef DEBUG +static bool tau_int_enable; /* TODO: put these in a /proc interface, with some sanity checks, and maybe * dynamic adjustment to minimize # of interrupts */ @@ -50,72 +49,49 @@ struct timer_list tau_timer; #define step_size 2 /* step size when temp goes out of range */ #define window_expand 1 /* expand the window by this much */ /* configurable values for shrinking the window */ -#define shrink_timer 2*HZ /* period between shrinking the window */ +#define shrink_timer 2000 /* period between shrinking the window */ #define min_window 2 /* minimum window size, degrees C */ static void set_thresholds(unsigned long cpu) { -#ifdef CONFIG_TAU_INT - /* - * setup THRM1, - * threshold, valid bit, enable interrupts, interrupt when below threshold - */ - mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID); + u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0; - /* setup THRM2, - * threshold, valid bit, enable interrupts, interrupt when above threshold - */ - mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE); -#else - /* same thing but don't enable interrupts */ - mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID); - mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V); -#endif + /* setup THRM1, threshold, valid bit, interrupt when below threshold */ + mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID); + + /* setup THRM2, threshold, valid bit, interrupt when above threshold */ + mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie); } static void TAUupdate(int cpu) { - unsigned thrm; - -#ifdef DEBUG - printk("TAUupdate "); -#endif + u32 thrm; + u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V; /* if both thresholds are crossed, the step_sizes cancel out * and the window winds up getting expanded twice. */ - if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */ - if(thrm & THRM1_TIN){ /* crossed low threshold */ - if (tau[cpu].low >= step_size){ - tau[cpu].low -= step_size; - tau[cpu].high -= (step_size - window_expand); - } - tau[cpu].grew = 1; -#ifdef DEBUG - printk("low threshold crossed "); -#endif + thrm = mfspr(SPRN_THRM1); + if ((thrm & bits) == bits) { + mtspr(SPRN_THRM1, 0); + + if (tau[cpu].low >= step_size) { + tau[cpu].low -= step_size; + tau[cpu].high -= (step_size - window_expand); } + tau[cpu].grew = 1; + pr_debug("%s: low threshold crossed\n", __func__); } - if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */ - if(thrm & THRM1_TIN){ /* crossed high threshold */ - if (tau[cpu].high <= 127-step_size){ - tau[cpu].low += (step_size - window_expand); - tau[cpu].high += step_size; - } - tau[cpu].grew = 1; -#ifdef DEBUG - printk("high threshold crossed "); -#endif + thrm = mfspr(SPRN_THRM2); + if ((thrm & bits) == bits) { + mtspr(SPRN_THRM2, 0); + + if (tau[cpu].high <= 127 - step_size) { + tau[cpu].low += (step_size - window_expand); + tau[cpu].high += step_size; } + tau[cpu].grew = 1; + pr_debug("%s: high threshold crossed\n", __func__); } - -#ifdef DEBUG - printk("grew = %d\n", tau[cpu].grew); -#endif - -#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */ - set_thresholds(cpu); -#endif - } #ifdef CONFIG_TAU_INT @@ -140,17 +116,16 @@ void TAUException(struct pt_regs * regs) static void tau_timeout(void * info) { int cpu; - unsigned long flags; int size; int shrink; - /* disabling interrupts *should* be okay */ - local_irq_save(flags); cpu = smp_processor_id(); -#ifndef CONFIG_TAU_INT - TAUupdate(cpu); -#endif + if (!tau_int_enable) + TAUupdate(cpu); + + /* Stop thermal sensor comparisons and interrupts */ + mtspr(SPRN_THRM3, 0); size = tau[cpu].high - tau[cpu].low; if (size > min_window && ! tau[cpu].grew) { @@ -173,32 +148,26 @@ static void tau_timeout(void * info) set_thresholds(cpu); - /* - * Do the enable every time, since otherwise a bunch of (relatively) - * complex sleep code needs to be added. One mtspr every time - * tau_timeout is called is probably not a big deal. - * - * Enable thermal sensor and set up sample interval timer - * need 20 us to do the compare.. until a nice 'cpu_speed' function - * call is implemented, just assume a 500 mhz clock. It doesn't really - * matter if we take too long for a compare since it's all interrupt - * driven anyway. - * - * use a extra long time.. (60 us @ 500 mhz) + /* Restart thermal sensor comparisons and interrupts. + * The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet" + * recommends that "the maximum value be set in THRM3 under all + * conditions." */ - mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E); - - local_irq_restore(flags); + mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E); } -static void tau_timeout_smp(struct timer_list *unused) -{ +static struct workqueue_struct *tau_workq; - /* schedule ourselves to be run again */ - mod_timer(&tau_timer, jiffies + shrink_timer) ; +static void tau_work_func(struct work_struct *work) +{ + msleep(shrink_timer); on_each_cpu(tau_timeout, NULL, 0); + /* schedule ourselves to be run again */ + queue_work(tau_workq, work); } +DECLARE_WORK(tau_work, tau_work_func); + /* * setup the TAU * @@ -231,21 +200,19 @@ static int __init TAU_init(void) return 1; } + tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) && + !strcmp(cur_cpu_spec->platform, "ppc750"); - /* first, set up the window shrinking timer */ - timer_setup(&tau_timer, tau_timeout_smp, 0); - tau_timer.expires = jiffies + shrink_timer; - add_timer(&tau_timer); + tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1, 0); + if (!tau_workq) + return -ENOMEM; on_each_cpu(TAU_init_smp, NULL, 0); - printk("Thermal assist unit "); -#ifdef CONFIG_TAU_INT - printk("using interrupts, "); -#else - printk("using timers, "); -#endif - printk("shrink_timer: %d jiffies\n", shrink_timer); + queue_work(tau_workq, &tau_work); + + pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n", + tau_int_enable ? "interrupts" : "workqueue", shrink_timer); tau_initialized = 1; return 0; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index f85539ebb513..74efe46f5532 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -75,15 +75,6 @@ #include <linux/clockchips.h> #include <linux/timekeeper_internal.h> -static u64 rtc_read(struct clocksource *); -static struct clocksource clocksource_rtc = { - .name = "rtc", - .rating = 400, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .mask = CLOCKSOURCE_MASK(64), - .read = rtc_read, -}; - static u64 timebase_read(struct clocksource *); static struct clocksource clocksource_timebase = { .name = "timebase", @@ -447,19 +438,9 @@ void vtime_flush(struct task_struct *tsk) void __delay(unsigned long loops) { unsigned long start; - int diff; spin_begin(); - if (__USE_RTC()) { - start = get_rtcl(); - do { - /* the RTCL register wraps at 1000000000 */ - diff = get_rtcl() - start; - if (diff < 0) - diff += 1000000000; - spin_cpu_relax(); - } while (diff < loops); - } else if (tb_invalid) { + if (tb_invalid) { /* * TB is in error state and isn't ticking anymore. * HMI handler was unable to recover from TB error. @@ -467,8 +448,8 @@ void __delay(unsigned long loops) */ spin_cpu_relax(); } else { - start = get_tbl(); - while (get_tbl() - start < loops) + start = mftb(); + while (mftb() - start < loops) spin_cpu_relax(); } spin_end(); @@ -614,7 +595,7 @@ void timer_interrupt(struct pt_regs *regs) irq_work_run(); } - now = get_tb_or_rtc(); + now = get_tb(); if (now >= *next_tb) { *next_tb = ~(u64)0; if (evt->event_handler) @@ -696,8 +677,6 @@ EXPORT_SYMBOL_GPL(tb_to_ns); */ notrace unsigned long long sched_clock(void) { - if (__USE_RTC()) - return get_rtc(); return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; } @@ -847,11 +826,6 @@ void read_persistent_clock64(struct timespec64 *ts) } /* clocksource code */ -static notrace u64 rtc_read(struct clocksource *cs) -{ - return (u64)get_rtc(); -} - static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); @@ -948,12 +922,7 @@ void update_vsyscall_tz(void) static void __init clocksource_init(void) { - struct clocksource *clock; - - if (__USE_RTC()) - clock = &clocksource_rtc; - else - clock = &clocksource_timebase; + struct clocksource *clock = &clocksource_timebase; if (clocksource_register_hz(clock, tb_ticks_per_sec)) { printk(KERN_ERR "clocksource: %s is already registered\n", @@ -968,7 +937,7 @@ static void __init clocksource_init(void) static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { - __this_cpu_write(decrementers_next_tb, get_tb_or_rtc() + evt); + __this_cpu_write(decrementers_next_tb, get_tb() + evt); set_dec(evt); /* We may have raced with new irq work */ @@ -1071,17 +1040,12 @@ void __init time_init(void) u64 scale; unsigned shift; - if (__USE_RTC()) { - /* 601 processor: dec counts down by 128 every 128ns */ - ppc_tb_freq = 1000000000; - } else { - /* Normal PowerPC with timebase register */ - ppc_md.calibrate_decr(); - printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n", - ppc_tb_freq / 1000000, ppc_tb_freq % 1000000); - printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n", - ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); - } + /* Normal PowerPC with timebase register */ + ppc_md.calibrate_decr(); + printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n", + ppc_tb_freq / 1000000, ppc_tb_freq % 1000000); + printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n", + ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; @@ -1107,7 +1071,7 @@ void __init time_init(void) tb_to_ns_scale = scale; tb_to_ns_shift = shift; /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ - boot_tb = get_tb_or_rtc(); + boot_tb = get_tb(); /* If platform provided a timezone (pmac), we correct the time */ if (timezone_offset) { diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 6ba0fdd1e7f8..2b91f233b05d 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -122,6 +122,13 @@ _GLOBAL(tm_reclaim) std r3, STK_PARAM(R3)(r1) SAVE_NVGPRS(r1) + /* + * Save kernel live AMR since it will be clobbered by treclaim + * but can be used elsewhere later in kernel space. + */ + mfspr r3, SPRN_AMR + std r3, TM_FRAME_L1(r1) + /* We need to setup MSR for VSX register save instructions. */ mfmsr r14 mr r15, r14 @@ -245,7 +252,7 @@ _GLOBAL(tm_reclaim) * but is used in signal return to 'wind back' to the abort handler. */ - /* ******************** CR,LR,CCR,MSR ********** */ + /* ***************** CTR, LR, CR, XER ********** */ mfctr r3 mflr r4 mfcr r5 @@ -256,7 +263,6 @@ _GLOBAL(tm_reclaim) std r5, _CCR(r7) std r6, _XER(r7) - /* ******************** TAR, DSCR ********** */ mfspr r3, SPRN_TAR mfspr r4, SPRN_DSCR @@ -264,6 +270,10 @@ _GLOBAL(tm_reclaim) std r3, THREAD_TM_TAR(r12) std r4, THREAD_TM_DSCR(r12) + /* ******************** AMR **************** */ + mfspr r3, SPRN_AMR + std r3, THREAD_TM_AMR(r12) + /* * MSR and flags: We don't change CRs, and we don't need to alter MSR. */ @@ -308,7 +318,9 @@ _GLOBAL(tm_reclaim) std r3, THREAD_TM_TFHAR(r12) std r4, THREAD_TM_TFIAR(r12) - /* AMR is checkpointed too, but is unsupported by Linux. */ + /* Restore kernel live AMR */ + ld r8, TM_FRAME_L1(r1) + mtspr SPRN_AMR, r8 /* Restore original MSR/IRQ state & clear TM mode */ ld r14, TM_FRAME_L0(r1) /* Orig MSR */ @@ -355,6 +367,13 @@ _GLOBAL(__tm_recheckpoint) */ SAVE_NVGPRS(r1) + /* + * Save kernel live AMR since it will be clobbered for trechkpt + * but can be used elsewhere later in kernel space. + */ + mfspr r8, SPRN_AMR + std r8, TM_FRAME_L0(r1) + /* Load complete register state from ts_ckpt* registers */ addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */ @@ -404,7 +423,7 @@ _GLOBAL(__tm_recheckpoint) restore_gprs: - /* ******************** CR,LR,CCR,MSR ********** */ + /* ****************** CTR, LR, XER ************* */ ld r4, _CTR(r7) ld r5, _LINK(r7) ld r8, _XER(r7) @@ -417,6 +436,10 @@ restore_gprs: ld r4, THREAD_TM_TAR(r3) mtspr SPRN_TAR, r4 + /* ******************** AMR ******************** */ + ld r4, THREAD_TM_AMR(r3) + mtspr SPRN_AMR, r4 + /* Load up the PPR and DSCR in GPRs only at this stage */ ld r5, THREAD_TM_DSCR(r3) ld r6, THREAD_TM_PPR(r3) @@ -509,6 +532,10 @@ restore_gprs: li r4, MSR_RI mtmsrd r4, 1 + /* Restore kernel live AMR */ + ld r8, TM_FRAME_L0(r1) + mtspr SPRN_AMR, r8 + REST_NVGPRS(r1) addi r1, r1, TM_FRAME_SIZE diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d1ebe152f210..c5f39f13e96e 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -529,9 +529,6 @@ out: * Check if the NIP corresponds to the address of a sync * instruction for which there is an entry in the exception * table. - * Note that the 601 only takes a machine check on TEA - * (transfer error ack) signal assertion, and does not - * set any of the top 16 bits of SRR1. * -- paulus. */ static inline int check_io_access(struct pt_regs *regs) @@ -796,7 +793,6 @@ int machine_check_generic(struct pt_regs *regs) case 0x80000: pr_cont("Machine check signal\n"); break; - case 0: /* for 601 */ case 0x40000: case 0x140000: /* 7450 MSS error and TEA */ pr_cont("Transfer error ack signal\n"); diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index e147bbdc12cd..73eada6bc8cd 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -29,7 +29,7 @@ ccflags-y := -shared -fno-common -fno-builtin -nostdlib \ asflags-y := -D__VDSO32__ -s obj-y += vdso32_wrapper.o -extra-y += vdso32.lds +targets += vdso32.lds CPPFLAGS_vdso32.lds += -P -C -Upowerpc # Force dependency (incbin is bad) diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 217bb630f8f9..1d23e2771dba 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -47,7 +47,6 @@ V_FUNCTION_END(__kernel_get_syscall_map) * * returns the timebase frequency in HZ */ -#ifndef CONFIG_PPC_BOOK3S_601 V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 @@ -60,4 +59,3 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) -#endif diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index 5206c2eb2a1d..7eadac74c7f9 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -144,13 +144,11 @@ VERSION __kernel_datapage_offset; __kernel_get_syscall_map; -#ifndef CONFIG_PPC_BOOK3S_601 __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; __kernel_time; __kernel_get_tbfreq; -#endif __kernel_sync_dicache; __kernel_sync_dicache_p5; __kernel_sigtramp32; diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index 32ebb3522ea1..dfd34f68bfa1 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -17,7 +17,7 @@ ccflags-y := -shared -fno-common -fno-builtin -nostdlib \ asflags-y := -D__VDSO64__ -s obj-y += vdso64_wrapper.o -extra-y += vdso64.lds +targets += vdso64.lds CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) # Force dependency (incbin is bad) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 49db50d1db04..44bf567b6589 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -558,12 +558,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, @@ -879,13 +879,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) #ifdef CONFIG_KVM_XICS /* - * Free the XIVE devices which are not directly freed by the + * Free the XIVE and XICS devices which are not directly freed by the * device 'release' method */ kfree(kvm->arch.xive_devices.native); kvm->arch.xive_devices.native = NULL; kfree(kvm->arch.xive_devices.xics_on_xive); kvm->arch.xive_devices.xics_on_xive = NULL; + kfree(kvm->arch.xics_device); + kvm->arch.xics_device = NULL; #endif /* CONFIG_KVM_XICS */ } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 22a677b18695..bb35490400e9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -347,7 +347,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, return __radix_pte_update(ptep, clr, set); } -void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, +static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, pte_t *ptep, pte_t pte) { radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 1a529df0ab44..8da93fdfa59e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -283,7 +283,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *siter; struct mm_struct *mm = kvm->mm; unsigned long npages, size = args->size; - int ret = -ENOMEM; + int ret; if (!args->size || args->page_shift < 12 || args->page_shift > 34 || (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) @@ -489,7 +489,7 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, return ret; } -long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, +static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, unsigned long entry, unsigned long ua, enum dma_data_direction dir) { diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ac6ac192b8bb..470e7c518a10 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -237,7 +237,7 @@ static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm, return ret; } -extern void iommu_tce_kill_rm(struct iommu_table *tbl, +static void iommu_tce_kill_rm(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { if (tbl->it_ops->tce_kill) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 4ba06a2a306c..e3b1839fc251 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -111,7 +111,7 @@ module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)"); #ifdef CONFIG_KVM_XICS -static struct kernel_param_ops module_param_ops = { +static const struct kernel_param_ops module_param_ops = { .set = param_set_int, .get = param_get_int, }; @@ -3442,9 +3442,19 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_psscr = mfspr(SPRN_PSSCR); unsigned long host_pidr = mfspr(SPRN_PID); + /* + * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0, + * so set HDICE before writing HDEC. + */ + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE); + isync(); + hdec = time_limit - mftb(); - if (hdec < 0) + if (hdec < 0) { + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + isync(); return BOOK3S_INTERRUPT_HV_DECREMENTER; + } mtspr(SPRN_HDEC, hdec); if (vc->tb_offset) { @@ -3530,6 +3540,13 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, */ asm volatile("eieio; tlbsync; ptesync"); + /* + * cp_abort is required if the processor supports local copy-paste + * to clear the copy buffer that was under control of the guest. + */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + asm volatile(PPC_CP_ABORT); + mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */ isync(); @@ -3558,7 +3575,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, * Virtual-mode guest entry for POWER9 and later when the host and * guest are both using the radix MMU. The LPIDR has already been set. */ -int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, +static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -3572,7 +3589,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, dec = mfspr(SPRN_DEC); tb = mftb(); - if (dec < 512) + if (dec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; local_paca->kvm_hstate.dec_expires = dec + tb; if (local_paca->kvm_hstate.dec_expires < time_limit) @@ -5250,6 +5267,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, case KVM_PPC_ALLOCATE_HTAB: { u32 htab_order; + /* If we're a nested hypervisor, we currently only support radix */ + if (kvmhv_on_pseries()) { + r = -EOPNOTSUPP; + break; + } + r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break; diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 59822cba454d..327417d79eac 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -58,13 +58,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* * Put whatever is in the decrementer into the * hypervisor decrementer. + * Because of a hardware deviation in P8 and P9, + * we need to set LPCR[HDICE] before writing HDEC. */ -BEGIN_FTR_SECTION ld r5, HSTATE_KVM_VCORE(r13) ld r6, VCORE_KVM(r5) ld r9, KVM_HOST_LPCR(r6) - andis. r9, r9, LPCR_LD@h -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ori r8, r9, LPCR_HDICE + mtspr SPRN_LPCR, r8 + isync + andis. r0, r9, LPCR_LD@h mfspr r8,SPRN_DEC mftb r7 BEGIN_FTR_SECTION diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 6822d23a2da4..33b58549a9aa 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -569,7 +569,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) kvmhv_set_nested_ptbl(gp); } -struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) +static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) { struct kvm_nested_guest *gp; long shadow_lpid; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 4d7e5610731a..c2c9c733f359 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -764,7 +764,7 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return ics_rm_eoi(vcpu, irq); } -unsigned long eoi_rc; +static unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) { diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 799d6d0f4ead..cd9995ee8441 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1831,6 +1831,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_RADIX_PREFETCH_BUG) #endif /* CONFIG_PPC_RADIX_MMU */ /* + * cp_abort is required if the processor supports local copy-paste + * to clear the copy buffer that was under control of the guest. + */ +BEGIN_FTR_SECTION + PPC_CP_ABORT +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31) + + /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do * have to coordinate the hardware threads. diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 88fac22fbf09..b1fefa63e125 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -569,7 +569,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) #endif } -void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) +static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) { u32 host_pvr; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 381bf8dea193..5fee5a11550d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return -ENXIO; } -static void kvmppc_xics_free(struct kvm_device *dev) +/* + * Called when device fd is closed. kvm->lock is held. + */ +static void kvmppc_xics_release(struct kvm_device *dev) { struct kvmppc_xics *xics = dev->private; int i; struct kvm *kvm = xics->kvm; + struct kvm_vcpu *vcpu; + + pr_devel("Releasing xics device\n"); + + /* + * Since this is the device release function, we know that + * userspace does not have any open fd referring to the + * device. Therefore there can not be any of the device + * attribute set/get functions being executed concurrently, + * and similarly, the connect_vcpu and set/clr_mapped + * functions also cannot be being executed. + */ debugfs_remove(xics->dentry); + /* + * We should clean up the vCPU interrupt presenters first. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + /* + * Take vcpu->mutex to ensure that no one_reg get/set ioctl + * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently. + * Holding the vcpu->mutex also means that execution is + * excluded for the vcpu until the ICP was freed. When the vcpu + * can execute again, vcpu->arch.icp and vcpu->arch.irq_type + * have been cleared and the vcpu will not be going into the + * XICS code anymore. + */ + mutex_lock(&vcpu->mutex); + kvmppc_xics_free_icp(vcpu); + mutex_unlock(&vcpu->mutex); + } + if (kvm) kvm->arch.xics = NULL; - for (i = 0; i <= xics->max_icsid; i++) + for (i = 0; i <= xics->max_icsid; i++) { kfree(xics->ics[i]); - kfree(xics); + xics->ics[i] = NULL; + } + /* + * A reference of the kvmppc_xics pointer is now kept under + * the xics_device pointer of the machine for reuse. It is + * freed when the VM is destroyed for now until we fix all the + * execution paths. + */ kfree(dev); } +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm) +{ + struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device; + struct kvmppc_xics *xics = *kvm_xics_device; + + if (!xics) { + xics = kzalloc(sizeof(*xics), GFP_KERNEL); + *kvm_xics_device = xics; + } else { + memset(xics, 0, sizeof(*xics)); + } + + return xics; +} + static int kvmppc_xics_create(struct kvm_device *dev, u32 type) { struct kvmppc_xics *xics; struct kvm *kvm = dev->kvm; - int ret = 0; - xics = kzalloc(sizeof(*xics), GFP_KERNEL); + pr_devel("Creating xics for partition\n"); + + /* Already there ? */ + if (kvm->arch.xics) + return -EEXIST; + + xics = kvmppc_xics_get_device(kvm); if (!xics) return -ENOMEM; dev->private = xics; xics->dev = dev; xics->kvm = kvm; - - /* Already there ? */ - if (kvm->arch.xics) - ret = -EEXIST; - else - kvm->arch.xics = xics; - - if (ret) { - kfree(xics); - return ret; - } + kvm->arch.xics = xics; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (cpu_has_feature(CPU_FTR_ARCH_206) && @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = { .name = "kvm-xics", .create = kvmppc_xics_create, .init = kvmppc_xics_init, - .destroy = kvmppc_xics_free, + .release = kvmppc_xics_release, .set_attr = xics_set_attr, .get_attr = xics_get_attr, .has_attr = xics_has_attr, @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, return -EPERM; if (xics->kvm != vcpu->kvm) return -EPERM; - if (vcpu->arch.irq_type) + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; r = kvmppc_xics_create_icp(vcpu, xcpu); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index bdea91df1497..d0c2db0e07fa 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1227,17 +1227,7 @@ static int xive_native_debug_show(struct seq_file *m, void *private) return 0; } -static int xive_native_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, xive_native_debug_show, inode->i_private); -} - -static const struct file_operations xive_native_debug_fops = { - .open = xive_native_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(xive_native_debug); static void xive_native_debugfs_init(struct kvmppc_xive *xive) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3e1c9f08e302..b1abcb816439 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1747,12 +1747,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -1773,7 +1773,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -ENOTSUPP; + return -EOPNOTSUPP; } void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S index ec5cd2dede35..27d9070617df 100644 --- a/arch/powerpc/lib/checksum_32.S +++ b/arch/powerpc/lib/checksum_32.S @@ -236,7 +236,6 @@ _GLOBAL(csum_partial_copy_generic) slwi r0,r0,8 adde r12,r12,r0 66: addze r3,r12 - addi r1,r1,16 beqlr+ cr7 rlwinm r3,r3,8,0,31 /* odd destination address: rotate one byte */ blr diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 8c3934ea6220..2333625b5e31 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -21,21 +21,18 @@ static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr, struct ppc_inst *patch_addr) { - int err = 0; - - if (!ppc_inst_prefixed(instr)) { - __put_user_asm(ppc_inst_val(instr), patch_addr, err, "stw"); - } else { - __put_user_asm(ppc_inst_as_u64(instr), patch_addr, err, "std"); - } - - if (err) - return err; + if (!ppc_inst_prefixed(instr)) + __put_user_asm_goto(ppc_inst_val(instr), patch_addr, failed, "stw"); + else + __put_user_asm_goto(ppc_inst_as_u64(instr), patch_addr, failed, "std"); asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr), "r" (exec_addr)); return 0; + +failed: + return -EFAULT; } int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index caee8cc77e19..855457ed09b5 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -108,11 +108,11 @@ static nokprobe_inline long address_ok(struct pt_regs *regs, { if (!user_mode(regs)) return 1; - if (__access_ok(ea, nb, USER_DS)) + if (__access_ok(ea, nb)) return 1; - if (__access_ok(ea, 1, USER_DS)) + if (__access_ok(ea, 1)) /* Access overlaps the end of the user region */ - regs->dar = USER_DS.seg; + regs->dar = TASK_SIZE_MAX - 1; else regs->dar = ea; return 0; @@ -219,10 +219,13 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr, ea += regs->gpr[ra]; else if (!prefix_r && !ra) ; /* Leave ea as is */ - else if (prefix_r && !ra) + else if (prefix_r) ea += regs->nip; - else if (prefix_r && ra) - ; /* Invalid form. Should already be checked for by caller! */ + + /* + * (prefix_r && ra) is an invalid form. Should already be + * checked for by caller! + */ return ea; } diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index 1690d369688b..b2c912e517b9 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -15,6 +15,7 @@ */ #include <linux/pgtable.h> +#include <linux/init.h> #include <asm/reg.h> #include <asm/page.h> #include <asm/cputable.h> @@ -199,11 +200,9 @@ _GLOBAL(add_hash_page) * covered by a BAT). -- paulus */ mfmsr r9 - SYNC rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear MSR_DR */ mtmsr r0 - SYNC_601 isync #ifdef CONFIG_SMP @@ -262,7 +261,6 @@ _GLOBAL(add_hash_page) /* reenable interrupts and DR */ mtmsr r9 - SYNC_601 isync lwz r0,4(r1) @@ -287,9 +285,9 @@ _ASM_NOKPROBE_SYMBOL(add_hash_page) * * For speed, 4 of the instructions get patched once the size and * physical address of the hash table are known. These definitions - * of Hash_base and Hash_bits below are just an example. + * of Hash_base and Hash_bits below are for the early hash table. */ -Hash_base = 0xc0180000 +Hash_base = early_hash Hash_bits = 12 /* e.g. 256kB hash table */ Hash_msk = (((1 << Hash_bits) - 1) * 64) @@ -310,6 +308,7 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) #define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) #define HASH_RIGHT 31-LG_PTEG_SIZE +__REF _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ @@ -476,6 +475,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) sync /* make sure pte updates get to memory */ blr + .previous _ASM_NOKPROBE_SYMBOL(create_hpte) .section .bss @@ -496,6 +496,7 @@ htab_hash_searches: * * We assume that there is a hash table in use (Hash != 0). */ +__REF _GLOBAL(flush_hash_pages) /* * We disable interrupts here, even on UP, because we want @@ -506,11 +507,9 @@ _GLOBAL(flush_hash_pages) * covered by a BAT). -- paulus */ mfmsr r10 - SYNC rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear MSR_DR */ mtmsr r0 - SYNC_601 isync /* First find a PTE in the range that has _PAGE_HASHPTE set */ @@ -629,9 +628,9 @@ _GLOBAL(flush_hash_pages) #endif 19: mtmsr r10 - SYNC_601 isync blr + .previous EXPORT_SYMBOL(flush_hash_pages) _ASM_NOKPROBE_SYMBOL(flush_hash_pages) @@ -643,11 +642,9 @@ _GLOBAL(_tlbie) lwz r8,TASK_CPU(r2) oris r8,r8,11 mfmsr r10 - SYNC rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear DR */ mtmsr r0 - SYNC_601 isync lis r9,mmu_hash_lock@h ori r9,r9,mmu_hash_lock@l @@ -664,7 +661,6 @@ _GLOBAL(_tlbie) li r0,0 stw r0,0(r9) /* clear mmu_hash_lock */ mtmsr r10 - SYNC_601 isync #else /* CONFIG_SMP */ tlbie r3 @@ -681,11 +677,9 @@ _GLOBAL(_tlbia) lwz r8,TASK_CPU(r2) oris r8,r8,10 mfmsr r10 - SYNC rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear DR */ mtmsr r0 - SYNC_601 isync lis r9,mmu_hash_lock@h ori r9,r9,mmu_hash_lock@l @@ -709,7 +703,6 @@ _GLOBAL(_tlbia) li r0,0 stw r0,0(r9) /* clear mmu_hash_lock */ mtmsr r10 - SYNC_601 isync #endif /* CONFIG_SMP */ blr diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index d426eaf76bb0..a59e7ec98180 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -31,6 +31,8 @@ #include <mm/mmu_decl.h> +u8 __initdata early_hash[SZ_256K] __aligned(SZ_256K) = {0}; + struct hash_pte *Hash; static unsigned long Hash_size, Hash_mask; unsigned long _SDR1; @@ -73,23 +75,13 @@ unsigned long p_block_mapped(phys_addr_t pa) static int find_free_bat(void) { int b; + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { - for (b = 0; b < 4; b++) { - struct ppc_bat *bat = BATS[b]; - - if (!(bat[0].batl & 0x40)) - return b; - } - } else { - int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; - for (b = 0; b < n; b++) { - struct ppc_bat *bat = BATS[b]; - - if (!(bat[1].batu & 3)) - return b; - } + if (!(bat[1].batu & 3)) + return b; } return -1; } @@ -97,7 +89,7 @@ static int find_free_bat(void) /* * This function calculates the size of the larger block usable to map the * beginning of an area based on the start address and size of that area: - * - max block size is 8M on 601 and 256 on other 6xx. + * - max block size is 256 on 6xx. * - base address must be aligned to the block size. So the maximum block size * is identified by the lowest bit set to 1 in the base address (for instance * if base is 0x16000000, max size is 0x02000000). @@ -106,7 +98,7 @@ static int find_free_bat(void) */ static unsigned int block_size(unsigned long base, unsigned long top) { - unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M; + unsigned int max_size = SZ_256M; unsigned int base_shift = (ffs(base) - 1) & 31; unsigned int block_shift = (fls(top - base) - 1) & 31; @@ -117,7 +109,6 @@ static unsigned int block_size(unsigned long base, unsigned long top) * Set up one of the IBAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. - * Only for 603+ ... */ static void setibat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot) @@ -214,9 +205,6 @@ void mmu_mark_initmem_nx(void) unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; unsigned long size; - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) - return; - for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { size = block_size(base, top); setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); @@ -253,9 +241,6 @@ void mmu_mark_rodata_ro(void) int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) - return; - for (i = 0; i < nb; i++) { struct ppc_bat *bat = BATS[i]; @@ -294,35 +279,22 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, flags &= ~_PAGE_COHERENT; bl = (size >> 17) - 1; - if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { - /* 603, 604, etc. */ - /* Do DBAT first */ - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; - bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) - bat[1].batu |= 1; /* Vp = 1 */ - if (flags & _PAGE_GUARDED) { - /* G bit must be zero in IBATs */ - flags &= ~_PAGE_EXEC; - } - if (flags & _PAGE_EXEC) - bat[0] = bat[1]; - else - bat[0].batu = bat[0].batl = 0; - } else { - /* 601 cpu */ - if (bl > BL_8M) - bl = BL_8M; - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT); - wimgxpp |= (flags & _PAGE_RW)? - ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; - bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ - bat->batl = phys | bl | 0x40; /* V=1 */ + /* Do DBAT first */ + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT | _PAGE_GUARDED); + wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[1].batu |= 1; /* Vp = 1 */ + if (flags & _PAGE_GUARDED) { + /* G bit must be zero in IBATs */ + flags &= ~_PAGE_EXEC; } + if (flags & _PAGE_EXEC) + bat[0] = bat[1]; + else + bat[0].batu = bat[0].batl = 0; bat_addrs[index].start = virt; bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; @@ -425,15 +397,6 @@ void __init MMU_init_hw(void) hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; if (lg_n_hpteg > 16) hash_mb2 = 16 - LG_HPTEG_SIZE; - - /* - * When KASAN is selected, there is already an early temporary hash - * table and the switch to the final hash table is done later. - */ - if (IS_ENABLED(CONFIG_KASAN)) - return; - - MMU_init_hw_patch(); } void __init MMU_init_hw_patch(void) @@ -441,6 +404,9 @@ void __init MMU_init_hw_patch(void) unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); unsigned int hash = (unsigned int)Hash - PAGE_OFFSET; + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + if (ppc_md.progress) ppc_md.progress("hash:patch", 0x345); if (ppc_md.progress) @@ -474,11 +440,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, */ BUG_ON(first_memblock_base != 0); - /* 601 can only access 16MB at the moment */ - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); - else /* Anything else has 256M mapped */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); + memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_256M)); } void __init print_system_hash_info(void) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index cf20e5229ce1..0203cdf48c54 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -82,7 +82,7 @@ static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) for (set = 0; set < num_sets; set++) tlbiel_hash_set_isa206(set, is); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) @@ -110,7 +110,7 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) */ tlbiel_hash_set_isa300(0, is, 0, 2, 1); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } @@ -303,7 +303,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, asm volatile("ptesync": : :"memory"); if (use_local) { __tlbiel(vpn, psize, apsize, ssize); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } else { __tlbie(vpn, psize, apsize, ssize); fixup_tlbie_vpn(vpn, psize, apsize, ssize); @@ -879,7 +879,7 @@ static void native_flush_hash_range(unsigned long number, int local) __tlbiel(vpn, psize, psize, ssize); } pte_iterate_hashed_end(); } - asm volatile("ptesync":::"memory"); + ppc_after_tlbiel_barrier(); } else { int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b830adee51f5..24702c0a92e0 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -260,8 +260,12 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", vstart, vend, pstart, prot, psize, ssize); - for (vaddr = vstart, paddr = pstart; vaddr < vend; - vaddr += step, paddr += step) { + /* Carefully map only the possible range */ + vaddr = ALIGN(vstart, step); + paddr = ALIGN(pstart, step); + vend = ALIGN_DOWN(vend, step); + + for (; vaddr < vend; vaddr += step, paddr += step) { unsigned long hash, hpteg; unsigned long vsid = get_kernel_vsid(vaddr, ssize); unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); @@ -343,7 +347,9 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, if (!mmu_hash_ops.hpte_removebolted) return -ENODEV; - for (vaddr = vstart; vaddr < vend; vaddr += step) { + /* Unmap the full range specificied */ + vaddr = ALIGN_DOWN(vstart, step); + for (;vaddr < vend; vaddr += step) { rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); if (rc == -ENOENT) { ret = -ENOENT; diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h index 7eda0d30d765..c12d78ee42f5 100644 --- a/arch/powerpc/mm/book3s64/internal.h +++ b/arch/powerpc/mm/book3s64/internal.h @@ -13,4 +13,6 @@ static inline bool stress_slb(void) return static_branch_unlikely(&stress_slb_key); } +void slb_setup_new_exec(void); + #endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */ diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 0ba30b8b935b..1c54821de7bf 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -21,6 +21,8 @@ #include <asm/mmu_context.h> #include <asm/pgalloc.h> +#include "internal.h" + static DEFINE_IDA(mmu_context_ida); static int alloc_context_id(int min_id, int max_id) @@ -48,8 +50,6 @@ int hash__alloc_context_id(void) } EXPORT_SYMBOL_GPL(hash__alloc_context_id); -void slb_setup_new_exec(void); - static int realloc_context_ids(mm_context_t *ctx) { int i, id; diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index cc72666e891a..3adcf730f478 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -34,7 +34,7 @@ unsigned int mmu_pid_bits; unsigned int mmu_base_pid; -unsigned int radix_mem_block_size __ro_after_init; +unsigned long radix_mem_block_size __ro_after_init; static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) @@ -276,6 +276,7 @@ static int __meminit create_physical_mapping(unsigned long start, int psize; start = ALIGN(start, PAGE_SIZE); + end = ALIGN_DOWN(end, PAGE_SIZE); for (addr = start; addr < end; addr += mapping_size) { unsigned long gap, previous_size; int rc; @@ -497,7 +498,7 @@ static int __init probe_memory_block_size(unsigned long node, const char *uname, depth, void *data) { unsigned long *mem_block_size = (unsigned long *)data; - const __be64 *prop; + const __be32 *prop; int len; if (depth != 1) @@ -507,13 +508,14 @@ static int __init probe_memory_block_size(unsigned long node, const char *uname, return 0; prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len); - if (!prop || len < sizeof(__be64)) + + if (!prop || len < dt_root_size_cells * sizeof(__be32)) /* * Nothing in the device tree */ *mem_block_size = MIN_MEMORY_BLOCK_SIZE; else - *mem_block_size = be64_to_cpup(prop); + *mem_block_size = of_read_number(prop, dt_root_size_cells); return 1; } diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 0d233763441f..b487b489d4b6 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -65,7 +65,7 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } void radix__tlbiel_all(unsigned int action) @@ -296,7 +296,7 @@ static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric) /* For PWC, only one flush is needed */ if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); return; } @@ -304,7 +304,7 @@ static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric) for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) __tlbiel_pid(pid, set, RIC_FLUSH_TLB); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory"); } @@ -431,7 +431,7 @@ static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid, asm volatile("ptesync": : :"memory"); __tlbiel_va(va, pid, ap, ric); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } static inline void _tlbiel_va_range(unsigned long start, unsigned long end, @@ -442,7 +442,7 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end, if (also_pwc) __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); __tlbiel_va_range(start, end, pid, page_size, psize); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } static inline void __tlbie_va_range(unsigned long start, unsigned long end, @@ -645,19 +645,29 @@ static void do_exit_flush_lazy_tlb(void *arg) struct mm_struct *mm = arg; unsigned long pid = mm->context.id; + /* + * A kthread could have done a mmget_not_zero() after the flushing CPU + * checked mm_is_singlethreaded, and be in the process of + * kthread_use_mm when interrupted here. In that case, current->mm will + * be set to mm, because kthread_use_mm() setting ->mm and switching to + * the mm is done with interrupts off. + */ if (current->mm == mm) - return; /* Local CPU */ + goto out_flush; if (current->active_mm == mm) { - /* - * Must be a kernel thread because sender is single-threaded. - */ - BUG_ON(current->mm); + WARN_ON_ONCE(current->mm != NULL); + /* Is a kernel thread and is using mm as the lazy tlb */ mmgrab(&init_mm); - switch_mm(mm, &init_mm, current); current->active_mm = &init_mm; + switch_mm_irqs_off(mm, &init_mm, current); mmdrop(mm); } + + atomic_dec(&mm->context.active_cpus); + cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm)); + +out_flush: _tlbiel_pid(pid, RIC_FLUSH_ALL); } @@ -672,7 +682,6 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm) */ smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, (void *)mm, 1); - mm_reset_thread_local(mm); } void radix__flush_tlb_mm(struct mm_struct *mm) @@ -940,7 +949,7 @@ is_local: if (hflush) __tlbiel_va_range(hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } else if (cputlb_use_tlbie()) { asm volatile("ptesync": : :"memory"); __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 156c38f89511..c30fcbfa0e32 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -765,8 +765,8 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) if (id == LINEAR_MAP_REGION_ID) { - /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) + /* We only support upto H_MAX_PHYSMEM_BITS */ + if ((ea & EA_MASK) > (1UL << H_MAX_PHYSMEM_BITS)) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 5ab4f868e919..30260b5d146d 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -11,7 +11,7 @@ #include <linux/types.h> #include <linux/highmem.h> #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <asm/tlbflush.h> #include <asm/dma.h> diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index b2eeea39684c..9af3832c9d8d 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -389,10 +389,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop) if (!drmem_info->lmbs) return; - for_each_drmem_lmb(lmb) { + for_each_drmem_lmb(lmb) read_drconf_v1_cell(lmb, &prop); - lmb_set_nid(lmb); - } } static void __init init_drmem_v2_lmbs(const __be32 *prop) @@ -437,8 +435,6 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop) lmb->aa_index = dr_cell.aa_index; lmb->flags = dr_cell.flags; - - lmb_set_nid(lmb); } } } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 26292544630f..36c3800769fb 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -180,7 +180,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz if (!hpdp) return NULL; - if (IS_ENABLED(CONFIG_PPC_8xx) && sz == SZ_512K) + if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) return pte_alloc_map(mm, (pmd_t *)hpdp, addr); BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); @@ -330,10 +330,24 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif get_hugepd_cache_index(pdshift - shift)); } -static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) +static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) { + unsigned long start = addr; pgtable_t token = pmd_pgtable(*pmd); + start &= PMD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); @@ -363,7 +377,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, */ WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx)); - hugetlb_free_pte_range(tlb, pmd, addr); + hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling); continue; } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 8459056cce67..386be136026e 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -162,16 +162,16 @@ static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) return next++; } -static __meminit void vmemmap_list_populate(unsigned long phys, - unsigned long start, - int node) +static __meminit int vmemmap_list_populate(unsigned long phys, + unsigned long start, + int node) { struct vmemmap_backing *vmem_back; vmem_back = vmemmap_list_alloc(node); if (unlikely(!vmem_back)) { - WARN_ON(1); - return; + pr_debug("vmemap list allocation failed\n"); + return -ENOMEM; } vmem_back->phys = phys; @@ -179,6 +179,7 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmem_back->list = vmemmap_list; vmemmap_list = vmem_back; + return 0; } static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, @@ -199,6 +200,7 @@ static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long star int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { + bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; /* Align to the page size of the linear mapping. */ @@ -228,13 +230,32 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, p = vmemmap_alloc_block_buf(page_size, node, altmap); if (!p) pr_debug("altmap block allocation failed, falling back to system memory"); + else + altmap_alloc = true; } - if (!p) + if (!p) { p = vmemmap_alloc_block_buf(page_size, node, NULL); + altmap_alloc = false; + } if (!p) return -ENOMEM; - vmemmap_list_populate(__pa(p), start, node); + if (vmemmap_list_populate(__pa(p), start, node)) { + /* + * If we don't populate vmemap list, we don't have + * the ability to free the allocated vmemmap + * pages in section_deactivate. Hence free them + * here. + */ + int nr_pfns = page_size >> PAGE_SHIFT; + unsigned long page_order = get_order(page_size); + + if (altmap_alloc) + vmem_altmap_free(altmap, nr_pfns); + else + free_pages((unsigned long)p, page_order); + return -ENOMEM; + } pr_debug(" * %016lx..%016lx allocated at %p\n", start, start + page_size, p); @@ -264,10 +285,8 @@ static unsigned long vmemmap_list_free(unsigned long start) vmem_back_prev = vmem_back; } - if (unlikely(!vmem_back)) { - WARN_ON(1); + if (unlikely(!vmem_back)) return 0; - } /* remove it from vmemmap_list */ if (vmem_back == vmemmap_list) /* remove head */ diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 26fda3203320..cf8770b1a692 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -127,8 +127,7 @@ void __init kasan_mmu_init(void) { int ret; - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE) || - IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); if (ret) @@ -140,10 +139,10 @@ void __init kasan_init(void) { phys_addr_t base, end; u64 i; + int ret; for_each_mem_range(i, &base, &end) { phys_addr_t top = min(end, total_lowmem); - int ret; if (base >= top) continue; @@ -153,6 +152,13 @@ void __init kasan_init(void) panic("kasan: kasan_init_region() failed"); } + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + kasan_remap_early_shadow_ro(); clear_page(kasan_early_shadow_page); @@ -168,22 +174,6 @@ void __init kasan_late_init(void) kasan_unmap_early_shadow_vmalloc(); } -#ifdef CONFIG_PPC_BOOK3S_32 -u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; - -static void __init kasan_early_hash_table(void) -{ - unsigned int hash = __pa(early_hash); - - modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16); - modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16); - - Hash = (struct hash_pte *)early_hash; -} -#else -static void __init kasan_early_hash_table(void) {} -#endif - void __init kasan_early_init(void) { unsigned long addr = KASAN_SHADOW_START; @@ -199,7 +189,4 @@ void __init kasan_early_init(void) next = pgd_addr_end(addr, end); pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte); } while (pmd++, addr = next, addr != end); - - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - kasan_early_hash_table(); } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5e2e7c0a8f1a..01ec2a252f09 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -49,6 +49,7 @@ #include <asm/swiotlb.h> #include <asm/rtas.h> #include <asm/kasan.h> +#include <asm/svm.h> #include <mm/mmu_decl.h> @@ -283,7 +284,10 @@ void __init mem_init(void) * back to to-down. */ memblock_set_bottom_up(true); - swiotlb_init(0); + if (is_secure_guest()) + svm_swiotlb_init(); + else + swiotlb_init(0); #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index d2b37146ae6c..231ca95f9ffb 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -244,13 +244,6 @@ void set_context(unsigned long id, pgd_t *pgd) mb(); } -void flush_instruction_cache(void) -{ - isync(); - mtspr(SPRN_IC_CST, IDC_INVALL); - isync(); -} - #ifdef CONFIG_PPC_KUEP void __init setup_kuep(bool disabled) { diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c index 0c294827d6e5..36bda962d3b3 100644 --- a/arch/powerpc/mm/nohash/fsl_booke.c +++ b/arch/powerpc/mm/nohash/fsl_booke.c @@ -219,6 +219,22 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } +void flush_instruction_cache(void) +{ + unsigned long tmp; + + if (IS_ENABLED(CONFIG_E200)) { + tmp = mfspr(SPRN_L1CSR0); + tmp |= L1CSR0_CFI | L1CSR0_CLFC; + mtspr(SPRN_L1CSR0, tmp); + } else { + tmp = mfspr(SPRN_L1CSR1); + tmp |= L1CSR1_ICFI | L1CSR1_ICLFR; + mtspr(SPRN_L1CSR1, tmp); + } + isync(); +} + /* * MMU_init_hw does the chip-specific initialization of the MMU hardware. */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 14514585db98..5872f69141d5 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -83,16 +83,12 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { }; #elif defined(CONFIG_PPC_8xx) struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - /* we only manage 4k and 16k pages as normal pages */ -#ifdef CONFIG_PPC_4K_PAGES [MMU_PAGE_4K] = { .shift = 12, }, -#else [MMU_PAGE_16K] = { .shift = 14, }, -#endif [MMU_PAGE_512K] = { .shift = 19, }, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f4e20d8e6c02..63f61d8b55e5 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -430,7 +430,7 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. */ -static int of_drconf_to_nid_single(struct drmem_lmb *lmb) +int of_drconf_to_nid_single(struct drmem_lmb *lmb) { struct assoc_arrays aa = { .arrays = NULL }; int default_nid = NUMA_NO_NODE; @@ -507,6 +507,11 @@ static int numa_setup_cpu(unsigned long lcpu) int fcpu = cpu_first_thread_sibling(lcpu); int nid = NUMA_NO_NODE; + if (!cpu_present(lcpu)) { + set_cpu_numa_node(lcpu, first_online_node); + return first_online_node; + } + /* * If a valid cpu-to-node mapping is already available, use it * directly instead of querying the firmware, since it represents @@ -723,21 +728,22 @@ static int __init parse_numa_properties(void) */ for_each_present_cpu(i) { struct device_node *cpu; - int nid; - - cpu = of_get_cpu_node(i, NULL); - BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); - of_node_put(cpu); + int nid = vphn_get_nid(i); /* * Don't fall back to default_nid yet -- we will plug * cpus into nodes once the memory scan has discovered * the topology. */ - if (nid < 0) - continue; - node_set_online(nid); + if (nid == NUMA_NO_NODE) { + cpu = of_get_cpu_node(i, NULL); + BUG_ON(!cpu); + nid = of_node_to_nid_single(cpu); + of_node_put(cpu); + } + + if (likely(nid > 0)) + node_set_online(nid); } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); @@ -888,7 +894,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { struct device_node *rtas; - u32 numnodes, i; + const __be32 *domains; + int prop_length, max_nodes; + u32 i; if (!numa_enabled) return; @@ -897,16 +905,31 @@ static void __init find_possible_nodes(void) if (!rtas) return; - if (of_property_read_u32_index(rtas, - "ibm,max-associativity-domains", - min_common_depth, &numnodes)) - goto out; + /* + * ibm,current-associativity-domains is a fairly recent property. If + * it doesn't exist, then fallback on ibm,max-associativity-domains. + * Current denotes what the platform can support compared to max + * which denotes what the Hypervisor can support. + */ + domains = of_get_property(rtas, "ibm,current-associativity-domains", + &prop_length); + if (!domains) { + domains = of_get_property(rtas, "ibm,max-associativity-domains", + &prop_length); + if (!domains) + goto out; + } - for (i = 0; i < numnodes; i++) { + max_nodes = of_read_number(&domains[min_common_depth], 1); + for (i = 0; i < max_nodes; i++) { if (!node_possible(i)) node_set(i, node_possible_map); } + prop_length /= sizeof(int); + if (prop_length > min_common_depth + 2) + coregroup_enabled = 1; + out: of_node_put(rtas); } @@ -915,6 +938,16 @@ void __init mem_topology_setup(void) { int cpu; + /* + * Linux/mm assumes node 0 to be online at boot. However this is not + * true on PowerPC, where node 0 is similar to any other node, it + * could be cpuless, memoryless node. So force node 0 to be offline + * for now. This will prevent cpuless, memoryless node 0 showing up + * unnecessarily as online. If a node has cpus or memory that need + * to be online, then node will anyway be marked online. + */ + node_set_offline(0); + if (parse_numa_properties()) setup_nonnuma(); @@ -932,8 +965,17 @@ void __init mem_topology_setup(void) reset_numa_cpu_lookup_table(); - for_each_present_cpu(cpu) + for_each_possible_cpu(cpu) { + /* + * Powerpc with CONFIG_NUMA always used to have a node 0, + * even if it was memoryless or cpuless. For all cpus that + * are possible but not present, cpu_to_node() would point + * to node 0. To remove a cpuless, memoryless dummy node, + * powerpc need to make sure all possible but not present + * cpu_to_node are set to a proper node. + */ numa_setup_cpu(cpu); + } } void __init initmem_init(void) @@ -1200,6 +1242,31 @@ int find_and_online_cpu_nid(int cpu) return new_nid; } +int cpu_to_coregroup_id(int cpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + int index; + + if (cpu < 0 || cpu > nr_cpu_ids) + return -1; + + if (!coregroup_enabled) + goto out; + + if (!firmware_has_feature(FW_FEATURE_VPHN)) + goto out; + + if (vphn_get_associativity(cpu, associativity)) + goto out; + + index = of_read_number(associativity, 1); + if (index > min_common_depth + 1) + return of_read_number(&associativity[index - 1], 1); + +out: + return cpu_to_core_id(cpu); +} + static int topology_update_init(void) { topology_inited = 1; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9c0547d77af3..15555c95cebc 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -184,9 +184,6 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - /* Add the pte bit when trying to set a pte */ - pte = pte_mkpte(pte); - /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this * is called. @@ -266,8 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ pmd_t *pmd = pmd_off(mm, addr); pte_basic_t val; pte_basic_t *entry = &ptep->pte; - int num = is_hugepd(*((hugepd_t *)pmd)) ? 1 : SZ_512K / SZ_4K; - int i; + int num, i; /* * Make sure hardware valid bit is not set. We don't do @@ -275,11 +271,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = pte_mkpte(pte); - pte = set_pte_filter(pte); val = pte_val(pte); + + num = number_of_cells_per_pte(pmd, val, 1); + for (i = 0; i < num; i++, entry++, val += SZ_4K) *entry = val; } diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 8a797dcbf475..86da2a669680 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -11,8 +11,13 @@ static const struct flag_info flag_array[] = { { +#ifdef CONFIG_PPC_16K_PAGES .mask = _PAGE_HUGE, .val = _PAGE_HUGE, +#else + .mask = _PAGE_SPS, + .val = _PAGE_SPS, +#endif .set = "huge", .clear = " ", }, { diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index e29b338d499f..c4c628b03cf8 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -12,62 +12,6 @@ #include "ptdump.h" -static char *pp_601(int k, int pp) -{ - if (pp == 0) - return k ? " " : "rwx"; - if (pp == 1) - return k ? "r x" : "rwx"; - if (pp == 2) - return "rwx"; - return "r x"; -} - -static void bat_show_601(struct seq_file *m, int idx, u32 lower, u32 upper) -{ - u32 blpi = upper & 0xfffe0000; - u32 k = (upper >> 2) & 3; - u32 pp = upper & 3; - phys_addr_t pbn = PHYS_BAT_ADDR(lower); - u32 bsm = lower & 0x3ff; - u32 size = (bsm + 1) << 17; - - seq_printf(m, "%d: ", idx); - if (!(lower & 0x40)) { - seq_puts(m, " -\n"); - return; - } - - seq_printf(m, "0x%08x-0x%08x ", blpi, blpi + size - 1); -#ifdef CONFIG_PHYS_64BIT - seq_printf(m, "0x%016llx ", pbn); -#else - seq_printf(m, "0x%08x ", pbn); -#endif - pt_dump_size(m, size); - - seq_printf(m, "Kernel %s User %s", pp_601(k & 2, pp), pp_601(k & 1, pp)); - - seq_puts(m, lower & _PAGE_WRITETHRU ? "w " : " "); - seq_puts(m, lower & _PAGE_NO_CACHE ? "i " : " "); - seq_puts(m, lower & _PAGE_COHERENT ? "m " : " "); - seq_puts(m, "\n"); -} - -#define BAT_SHOW_601(_m, _n, _l, _u) bat_show_601(_m, _n, mfspr(_l), mfspr(_u)) - -static int bats_show_601(struct seq_file *m, void *v) -{ - seq_puts(m, "---[ Block Address Translation ]---\n"); - - BAT_SHOW_601(m, 0, SPRN_IBAT0L, SPRN_IBAT0U); - BAT_SHOW_601(m, 1, SPRN_IBAT1L, SPRN_IBAT1U); - BAT_SHOW_601(m, 2, SPRN_IBAT2L, SPRN_IBAT2U); - BAT_SHOW_601(m, 3, SPRN_IBAT3L, SPRN_IBAT3U); - - return 0; -} - static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool is_d) { u32 bepi = upper & 0xfffe0000; @@ -146,9 +90,6 @@ static int bats_show_603(struct seq_file *m, void *v) static int bats_open(struct inode *inode, struct file *file) { - if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) - return single_open(file, bats_show_601, NULL); - return single_open(file, bats_show_603, NULL); } diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c index df59d0bb121f..489f993100d5 100644 --- a/arch/powerpc/oprofile/cell/spu_task_sync.c +++ b/arch/powerpc/oprofile/cell/spu_task_sync.c @@ -572,7 +572,7 @@ void spu_sync_buffer(int spu_num, unsigned int *samples, * samples are recorded. * No big deal -- so we just drop a few samples. */ - pr_debug("SPU_PROF: No cached SPU contex " + pr_debug("SPU_PROF: No cached SPU context " "for SPU #%d. Dropping samples.\n", spu_num); goto out; } diff --git a/arch/powerpc/perf/hv-gpci-requests.h b/arch/powerpc/perf/hv-gpci-requests.h index e608f9db12dd..8965b4463d43 100644 --- a/arch/powerpc/perf/hv-gpci-requests.h +++ b/arch/powerpc/perf/hv-gpci-requests.h @@ -95,7 +95,7 @@ REQUEST(__field(0, 8, partition_id) #define REQUEST_NAME system_performance_capabilities #define REQUEST_NUM 0x40 -#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff" +#define REQUEST_IDX_KIND "starting_index=0xffffffff" #include I(REQUEST_BEGIN) REQUEST(__field(0, 1, perf_collect_privileged) __field(0x1, 1, capability_mask) @@ -223,7 +223,7 @@ REQUEST(__field(0, 2, partition_id) #define REQUEST_NAME system_hypervisor_times #define REQUEST_NUM 0xF0 -#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff" +#define REQUEST_IDX_KIND "starting_index=0xffffffff" #include I(REQUEST_BEGIN) REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors) __count(0x8, 8, time_spent_processing_virtual_processor_timers) @@ -234,7 +234,7 @@ REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors) #define REQUEST_NAME system_tlbie_count_and_time #define REQUEST_NUM 0xF4 -#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff" +#define REQUEST_IDX_KIND "starting_index=0xffffffff" #include I(REQUEST_BEGIN) REQUEST(__count(0, 8, tlbie_instructions_issued) /* diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 6884d16ec19b..d48413e28c39 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -48,6 +48,8 @@ EVENT_DEFINE_RANGE_FORMAT(length, config1, 24, 31); /* u32, byte offset */ EVENT_DEFINE_RANGE_FORMAT(offset, config1, 32, 63); +static cpumask_t hv_gpci_cpumask; + static struct attribute *format_attrs[] = { &format_attr_request.attr, &format_attr_starting_index.attr, @@ -94,7 +96,15 @@ static ssize_t kernel_version_show(struct device *dev, return sprintf(page, "0x%x\n", COUNTER_INFO_VERSION_CURRENT); } +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &hv_gpci_cpumask); +} + static DEVICE_ATTR_RO(kernel_version); +static DEVICE_ATTR_RO(cpumask); + HV_CAPS_ATTR(version, "0x%x\n"); HV_CAPS_ATTR(ga, "%d\n"); HV_CAPS_ATTR(expanded, "%d\n"); @@ -111,6 +121,15 @@ static struct attribute *interface_attrs[] = { NULL, }; +static struct attribute *cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = cpumask_attrs, +}; + static struct attribute_group interface_group = { .name = "interface", .attrs = interface_attrs, @@ -120,20 +139,12 @@ static const struct attribute_group *attr_groups[] = { &format_group, &event_group, &interface_group, + &cpumask_attr_group, NULL, }; -#define HGPCI_REQ_BUFFER_SIZE 4096 -#define HGPCI_MAX_DATA_BYTES \ - (HGPCI_REQ_BUFFER_SIZE - sizeof(struct hv_get_perf_counter_info_params)) - static DEFINE_PER_CPU(char, hv_gpci_reqb[HGPCI_REQ_BUFFER_SIZE]) __aligned(sizeof(uint64_t)); -struct hv_gpci_request_buffer { - struct hv_get_perf_counter_info_params params; - uint8_t bytes[HGPCI_MAX_DATA_BYTES]; -} __packed; - static unsigned long single_gpci_request(u32 req, u32 starting_index, u16 secondary_index, u8 version_in, u32 offset, u8 length, u64 *value) @@ -275,6 +286,45 @@ static struct pmu h_gpci_pmu = { .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; +static int ppc_hv_gpci_cpu_online(unsigned int cpu) +{ + if (cpumask_empty(&hv_gpci_cpumask)) + cpumask_set_cpu(cpu, &hv_gpci_cpumask); + + return 0; +} + +static int ppc_hv_gpci_cpu_offline(unsigned int cpu) +{ + int target; + + /* Check if exiting cpu is used for collecting gpci events */ + if (!cpumask_test_and_clear_cpu(cpu, &hv_gpci_cpumask)) + return 0; + + /* Find a new cpu to collect gpci events */ + target = cpumask_last(cpu_active_mask); + + if (target < 0 || target >= nr_cpu_ids) { + pr_err("hv_gpci: CPU hotplug init failed\n"); + return -1; + } + + /* Migrate gpci events to the new target */ + cpumask_set_cpu(target, &hv_gpci_cpumask); + perf_pmu_migrate_context(&h_gpci_pmu, cpu, target); + + return 0; +} + +static int hv_gpci_cpu_hotplug_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE, + "perf/powerpc/hv_gcpi:online", + ppc_hv_gpci_cpu_online, + ppc_hv_gpci_cpu_offline); +} + static int hv_gpci_init(void) { int r; @@ -295,6 +345,11 @@ static int hv_gpci_init(void) return -ENODEV; } + /* init cpuhotplug */ + r = hv_gpci_cpu_hotplug_init(); + if (r) + return r; + /* sampling not supported */ h_gpci_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; diff --git a/arch/powerpc/perf/hv-gpci.h b/arch/powerpc/perf/hv-gpci.h index a3053eda5dcc..4d108262bed7 100644 --- a/arch/powerpc/perf/hv-gpci.h +++ b/arch/powerpc/perf/hv-gpci.h @@ -2,33 +2,6 @@ #ifndef LINUX_POWERPC_PERF_HV_GPCI_H_ #define LINUX_POWERPC_PERF_HV_GPCI_H_ -#include <linux/types.h> - -/* From the document "H_GetPerformanceCounterInfo Interface" v1.07 */ - -/* H_GET_PERF_COUNTER_INFO argument */ -struct hv_get_perf_counter_info_params { - __be32 counter_request; /* I */ - __be32 starting_index; /* IO */ - __be16 secondary_index; /* IO */ - __be16 returned_values; /* O */ - __be32 detail_rc; /* O, only needed when called via *_norets() */ - - /* - * O, size each of counter_value element in bytes, only set for version - * >= 0x3 - */ - __be16 cv_element_size; - - /* I, 0 (zero) for versions < 0x3 */ - __u8 counter_info_version_in; - - /* O, 0 (zero) if version < 0x3. Must be set to 0 when making hcall */ - __u8 counter_info_version_out; - __u8 reserved[0xC]; - __u8 counter_value[]; -} __packed; - /* * counter info version => fw version/reference (spec version) * diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 62d0b54086f8..9ed4fcccf8a9 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1426,8 +1426,6 @@ static void trace_imc_event_del(struct perf_event *event, int flags) static int trace_imc_event_init(struct perf_event *event) { - struct task_struct *target; - if (event->attr.type != event->pmu->type) return -ENOENT; @@ -1458,7 +1456,6 @@ static int trace_imc_event_init(struct perf_event *event) mutex_unlock(&imc_global_refc.lock); event->hw.idx = -1; - target = event->hw.target; event->pmu->task_ctx_nr = perf_hw_context; event->destroy = reset_global_refc; diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 964437adec18..2848904df638 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -288,6 +288,15 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) mask |= CNST_PMC_MASK(pmc); value |= CNST_PMC_VAL(pmc); + + /* + * PMC5 and PMC6 are used to count cycles and instructions and + * they do not support most of the constraint bits. Add a check + * to exclude PMC5/6 from most of the constraints except for + * EBB/BHRB. + */ + if (pmc >= 5) + goto ebb_bhrb; } if (pmc <= 4) { @@ -357,6 +366,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) } } +ebb_bhrb: if (!pmc && ebb) /* EBB events must specify the PMC */ return -1; diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index 044de65e96b9..7025de5e60e7 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -13,6 +13,8 @@ #include <asm/firmware.h> #include <asm/cputable.h> +#include "internal.h" + #define EVENT_EBB_MASK 1ull #define EVENT_EBB_SHIFT PERF_EVENT_CONFIG_EBB_SHIFT #define EVENT_BHRB_MASK 1ull diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c index 83148656b524..9dbe8f9b89b4 100644 --- a/arch/powerpc/perf/power10-pmu.c +++ b/arch/powerpc/perf/power10-pmu.c @@ -9,7 +9,6 @@ #define pr_fmt(fmt) "power10-pmu: " fmt #include "isa207-common.h" -#include "internal.h" /* * Raw event encoding for Power10: diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c index a62b2cd7914f..3e64b4a1511f 100644 --- a/arch/powerpc/perf/power5+-pmu.c +++ b/arch/powerpc/perf/power5+-pmu.c @@ -10,6 +10,8 @@ #include <asm/reg.h> #include <asm/cputable.h> +#include "internal.h" + /* * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) */ diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c index 8732b587cf71..017bb19b73fb 100644 --- a/arch/powerpc/perf/power5-pmu.c +++ b/arch/powerpc/perf/power5-pmu.c @@ -10,6 +10,8 @@ #include <asm/reg.h> #include <asm/cputable.h> +#include "internal.h" + /* * Bits in event code for POWER5 (not POWER5++) */ diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c index 0e318cf87129..189974478e9f 100644 --- a/arch/powerpc/perf/power6-pmu.c +++ b/arch/powerpc/perf/power6-pmu.c @@ -10,6 +10,8 @@ #include <asm/reg.h> #include <asm/cputable.h> +#include "internal.h" + /* * Bits in event code for POWER6 */ diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index 5e0bf09cf077..bacfab104a1a 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -10,6 +10,8 @@ #include <asm/reg.h> #include <asm/cputable.h> +#include "internal.h" + /* * Bits in event code for POWER7 */ diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c index d35223fb112c..7d78df97f272 100644 --- a/arch/powerpc/perf/ppc970-pmu.c +++ b/arch/powerpc/perf/ppc970-pmu.c @@ -9,6 +9,8 @@ #include <asm/reg.h> #include <asm/cputable.h> +#include "internal.h" + /* * Bits in event code for PPC970 */ diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index 90ad6ac529d2..a5c898bb9bab 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -7,6 +7,7 @@ #include <linux/ptrace.h> #include <asm/reg.h> +#include <asm/cacheflush.h> int machine_check_440A(struct pt_regs *regs) { diff --git a/arch/powerpc/platforms/44x/ppc476.c b/arch/powerpc/platforms/44x/ppc476.c index cba83eee685c..07f7e3ce67b5 100644 --- a/arch/powerpc/platforms/44x/ppc476.c +++ b/arch/powerpc/platforms/44x/ppc476.c @@ -86,8 +86,7 @@ static void __noreturn avr_reset_system(char *cmd) avr_halt_system(AVR_PWRCTL_RESET); } -static int avr_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int avr_probe(struct i2c_client *client) { avr_i2c_client = client; ppc_md.restart = avr_reset_system; @@ -104,7 +103,7 @@ static struct i2c_driver avr_driver = { .driver = { .name = "akebono-avr", }, - .probe = avr_probe, + .probe_new = avr_probe, .id_table = avr_id, }; diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index 0967bdfb1691..409481016928 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -142,7 +142,7 @@ static int mcu_gpiochip_remove(struct mcu *mcu) return 0; } -static int mcu_probe(struct i2c_client *client, const struct i2c_device_id *id) +static int mcu_probe(struct i2c_client *client) { struct mcu *mcu; int ret; @@ -221,7 +221,7 @@ static struct i2c_driver mcu_driver = { .name = "mcu-mpc8349emitx", .of_match_table = mcu_of_match_table, }, - .probe = mcu_probe, + .probe_new = mcu_probe, .remove = mcu_remove, .id_table = mcu_ids, }; diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index fda108bae95f..c6df294054fe 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -112,7 +112,7 @@ static void mpc85xx_take_timebase(void) local_irq_restore(flags); } -static void smp_85xx_mach_cpu_die(void) +static void smp_85xx_cpu_offline_self(void) { unsigned int cpu = smp_processor_id(); @@ -506,7 +506,7 @@ void __init mpc85xx_smp_init(void) if (qoriq_pm_ops) { smp_85xx_ops.give_timebase = mpc85xx_give_timebase; smp_85xx_ops.take_timebase = mpc85xx_take_timebase; - ppc_md.cpu_die = smp_85xx_mach_cpu_die; + smp_85xx_ops.cpu_offline_self = smp_85xx_cpu_offline_self; smp_85xx_ops.cpu_die = qoriq_cpu_kill; } #endif diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index fb7515b4fa9c..7a5e8f4541e3 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -199,21 +199,6 @@ source "drivers/cpuidle/Kconfig" endmenu -config PPC601_SYNC_FIX - bool "Workarounds for PPC601 bugs" - depends on PPC_BOOK3S_601 && PPC_PMAC - default y - help - Some versions of the PPC601 (the first PowerPC chip) have bugs which - mean that extra synchronization instructions are required near - certain instructions, typically those that make major changes to the - CPU state. These extra instructions reduce performance slightly. - If you say N here, these extra instructions will not be included, - resulting in a kernel which will run faster but may not run at all - on some systems with the PPC601 chip. - - If in doubt, say Y here. - config TAU bool "On-chip CPU temperature sensor support" depends on PPC_BOOK3S_32 @@ -223,12 +208,11 @@ config TAU temperature within 2-4 degrees Celsius. This option shows the current on-die temperature in /proc/cpuinfo if the cpu supports it. - Unfortunately, on some chip revisions, this sensor is very inaccurate - and in many cases, does not work at all, so don't assume the cpu - temp is actually what /proc/cpuinfo says it is. + Unfortunately, this sensor is very inaccurate when uncalibrated, so + don't assume the cpu temp is actually what /proc/cpuinfo says it is. config TAU_INT - bool "Interrupt driven TAU driver (DANGEROUS)" + bool "Interrupt driven TAU driver (EXPERIMENTAL)" depends on TAU help The TAU supports an interrupt driven mode which causes an interrupt @@ -236,12 +220,7 @@ config TAU_INT to get notified the temp has exceeded a range. With this option off, a timer is used to re-check the temperature periodically. - However, on some cpus it appears that the TAU interrupt hardware - is buggy and can cause a situation which would lead unexplained hard - lockups. - - Unless you are extending the TAU driver, or enjoy kernel/hardware - debugging, leave this option off. + If in doubt, say N here. config TAU_AVERAGE bool "Average high and low temp" diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 1dc9d3c81872..c194c4ae8bc7 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -20,7 +20,7 @@ choice depends on PPC32 help There are five families of 32 bit PowerPC chips supported. - The most common ones are the desktop and server CPUs (601, 603, + The most common ones are the desktop and server CPUs (603, 604, 740, 750, 74xx) CPUs from Freescale and IBM, with their embedded 512x/52xx/82xx/83xx/86xx counterparts. The other embedded parts, namely 4xx, 8xx, e200 (55xx) and e500 @@ -30,7 +30,7 @@ choice If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx. config PPC_BOOK3S_6xx - bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601" + bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" select PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT @@ -38,13 +38,6 @@ config PPC_BOOK3S_6xx select PPC_HAVE_KUAP select HAVE_ARCH_VMAP_STACK if !ADB_PMU -config PPC_BOOK3S_601 - bool "PowerPC 601" - select PPC_BOOK3S_32 - select PPC_FPU - select PPC_HAVE_KUAP - select HAVE_ARCH_VMAP_STACK - config PPC_85xx bool "Freescale 85xx" select E500 @@ -490,13 +483,12 @@ endmenu config VDSO32 def_bool y - depends on PPC32 || CPU_BIG_ENDIAN + depends on PPC32 || COMPAT help This symbol controls whether we build the 32-bit VDSO. We obviously want to do that if we're building a 32-bit kernel. If we're building - a 64-bit kernel then we only want a 32-bit VDSO if we're building for - big endian. That is because the only little endian configuration we - support is ppc64le which is 64-bit only. + a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling + COMPAT. choice prompt "Endianness selection" diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c index 15437abe1f6d..b95c3380d2b5 100644 --- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c +++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c @@ -147,7 +147,8 @@ static void __noreturn mpc7448_hpc2_restart(char *cmd) local_irq_disable(); /* Set exception prefix high - to the firmware */ - _nmask_and_or_msr(0, MSR_IP); + mtmsr(mfmsr() | MSR_IP); + isync(); for (;;) ; /* Spin until reset happens */ } diff --git a/arch/powerpc/platforms/embedded6xx/storcenter.c b/arch/powerpc/platforms/embedded6xx/storcenter.c index ed1914dd34bb..e346ddcef45e 100644 --- a/arch/powerpc/platforms/embedded6xx/storcenter.c +++ b/arch/powerpc/platforms/embedded6xx/storcenter.c @@ -101,7 +101,8 @@ static void __noreturn storcenter_restart(char *cmd) local_irq_disable(); /* Set exception prefix high - to the firmware */ - _nmask_and_or_msr(0, MSR_IP); + mtmsr(mfmsr() | MSR_IP); + isync(); /* Wait for reset to happen */ for (;;) ; diff --git a/arch/powerpc/platforms/powermac/pmac.h b/arch/powerpc/platforms/powermac/pmac.h index 16a52afdb76e..0d715db434dc 100644 --- a/arch/powerpc/platforms/powermac/pmac.h +++ b/arch/powerpc/platforms/powermac/pmac.h @@ -34,7 +34,7 @@ extern void pmac_check_ht_link(void); extern void pmac_setup_smp(void); extern int psurge_secondary_virq; -extern void low_cpu_die(void) __attribute__((noreturn)); +extern void low_cpu_offline_self(void) __attribute__((noreturn)); extern int pmac_nvram_init(void); extern void pmac_pic_init(void); diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index f002b0fa69b8..2e2cc0c75d87 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -284,7 +284,7 @@ static void __init pmac_setup_arch(void) /* 604, G3, G4 etc. */ loops_per_jiffy = *fp / HZ; else - /* 601, 603, etc. */ + /* 603, etc. */ loops_per_jiffy = *fp / (2 * HZ); of_node_put(cpu); break; diff --git a/arch/powerpc/platforms/powermac/sleep.S b/arch/powerpc/platforms/powermac/sleep.S index f9a680fdd9c4..7e0f8ba6e54a 100644 --- a/arch/powerpc/platforms/powermac/sleep.S +++ b/arch/powerpc/platforms/powermac/sleep.S @@ -201,8 +201,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) addi r3,r3,sleep_storage@l stw r5,0(r3) - .globl low_cpu_die -low_cpu_die: + .globl low_cpu_offline_self +low_cpu_offline_self: /* Flush & disable all caches */ bl flush_disable_caches @@ -244,7 +244,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450) mtmsr r2 isync b 1b -_ASM_NOKPROBE_SYMBOL(low_cpu_die) +_ASM_NOKPROBE_SYMBOL(low_cpu_offline_self) /* * Here is the resume code. */ @@ -294,14 +294,7 @@ grackle_wake_up: * we do any r1 memory access as we are not sure they * are in a sane state above the first 256Mb region */ - li r0,16 /* load up segment register values */ - mtctr r0 /* for context 0 */ - lis r3,0x2000 /* Ku = 1, VSID = 0 */ - li r4,0 -3: mtsrin r3,r4 - addi r3,r3,0x111 /* increment VSID */ - addis r4,r4,0x1000 /* address of next segment */ - bdnz 3b + bl load_segment_registers sync isync diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index eb23264910e1..74ebe664b016 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -270,10 +270,6 @@ static void __init smp_psurge_probe(void) int i, ncpus; struct device_node *dn; - /* We don't do SMP on the PPC601 -- paulus */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) - return; - /* * The powersurge cpu board can be used in the generation * of powermacs that have a socket for an upgradeable cpu card, @@ -920,7 +916,7 @@ static int smp_core99_cpu_disable(void) #ifdef CONFIG_PPC32 -static void pmac_cpu_die(void) +static void pmac_cpu_offline_self(void) { int cpu = smp_processor_id(); @@ -930,12 +926,12 @@ static void pmac_cpu_die(void) generic_set_cpu_dead(cpu); smp_wmb(); mb(); - low_cpu_die(); + low_cpu_offline_self(); } #else /* CONFIG_PPC32 */ -static void pmac_cpu_die(void) +static void pmac_cpu_offline_self(void) { int cpu = smp_processor_id(); @@ -1020,7 +1016,7 @@ void __init pmac_setup_smp(void) #endif /* CONFIG_PPC_PMAC32_PSURGE */ #ifdef CONFIG_HOTPLUG_CPU - ppc_md.cpu_die = pmac_cpu_die; + smp_ops->cpu_offline_self = pmac_cpu_offline_self; #endif } diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 9af8c3b98853..89e22c460ebf 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -38,60 +38,12 @@ static int eeh_event_irq = -EINVAL; -void pnv_pcibios_bus_add_device(struct pci_dev *pdev) +static void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { dev_dbg(&pdev->dev, "EEH: Setting up device\n"); eeh_probe_device(pdev); } -static int pnv_eeh_init(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - int max_diag_size = PNV_PCI_DIAG_BUF_SIZE; - - if (!firmware_has_feature(FW_FEATURE_OPAL)) { - pr_warn("%s: OPAL is required !\n", - __func__); - return -EINVAL; - } - - /* Set probe mode */ - eeh_add_flag(EEH_PROBE_MODE_DEV); - - /* - * P7IOC blocks PCI config access to frozen PE, but PHB3 - * doesn't do that. So we have to selectively enable I/O - * prior to collecting error log. - */ - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - - if (phb->model == PNV_PHB_MODEL_P7IOC) - eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); - - if (phb->diag_data_size > max_diag_size) - max_diag_size = phb->diag_data_size; - - /* - * PE#0 should be regarded as valid by EEH core - * if it's not the reserved one. Currently, we - * have the reserved PE#255 and PE#127 for PHB3 - * and P7IOC separately. So we should regard - * PE#0 as valid for PHB3 and P7IOC. - */ - if (phb->ioda.reserved_pe_idx != 0) - eeh_add_flag(EEH_VALID_PE_ZERO); - - break; - } - - eeh_set_pe_aux_size(max_diag_size); - ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device; - - return 0; -} - static irqreturn_t pnv_eeh_event(int irq, void *data) { /* @@ -135,7 +87,7 @@ static ssize_t pnv_eeh_ei_write(struct file *filp, return -EINVAL; /* Retrieve PE */ - pe = eeh_pe_get(hose, pe_no, 0); + pe = eeh_pe_get(hose, pe_no); if (!pe) return -ENODEV; @@ -190,7 +142,7 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); #endif /* CONFIG_DEBUG_FS */ -void pnv_eeh_enable_phbs(void) +static void pnv_eeh_enable_phbs(void) { struct pci_controller *hose; struct pnv_phb *phb; @@ -354,7 +306,7 @@ static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev) if (parent) { struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent); - return eeh_pe_get(phb->hose, ioda_pe->pe_number, 0); + return eeh_pe_get(phb->hose, ioda_pe->pe_number); } return NULL; @@ -1406,7 +1358,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, } /* Find the PE according to PE# */ - dev_pe = eeh_pe_get(hose, pe_no, 0); + dev_pe = eeh_pe_get(hose, pe_no); if (!dev_pe) return -EEXIST; @@ -1674,7 +1626,6 @@ static int pnv_eeh_restore_config(struct eeh_dev *edev) static struct eeh_ops pnv_eeh_ops = { .name = "powernv", - .init = pnv_eeh_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, .get_state = pnv_eeh_get_state, @@ -1715,9 +1666,44 @@ DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps); */ static int __init eeh_powernv_init(void) { + int max_diag_size = PNV_PCI_DIAG_BUF_SIZE; + struct pci_controller *hose; + struct pnv_phb *phb; int ret = -EINVAL; - ret = eeh_ops_register(&pnv_eeh_ops); + if (!firmware_has_feature(FW_FEATURE_OPAL)) { + pr_warn("%s: OPAL is required !\n", __func__); + return -EINVAL; + } + + /* Set probe mode */ + eeh_add_flag(EEH_PROBE_MODE_DEV); + + /* + * P7IOC blocks PCI config access to frozen PE, but PHB3 + * doesn't do that. So we have to selectively enable I/O + * prior to collecting error log. + */ + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_P7IOC) + eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + + if (phb->diag_data_size > max_diag_size) + max_diag_size = phb->diag_data_size; + + break; + } + + /* + * eeh_init() allocates the eeh_pe and its aux data buf so the + * size needs to be set before calling eeh_init(). + */ + eeh_set_pe_aux_size(max_diag_size); + ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device; + + ret = eeh_init(&pnv_eeh_ops); if (!ret) pr_info("EEH: PowerNV platform initialized\n"); else @@ -1725,4 +1711,4 @@ static int __init eeh_powernv_init(void) return ret; } -machine_early_initcall(powernv, eeh_powernv_init); +machine_arch_initcall(powernv, eeh_powernv_init); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 345ab062b21a..1ed7c5286487 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -565,7 +565,7 @@ void power7_idle_type(unsigned long type) irq_set_pending_from_srr1(srr1); } -void power7_idle(void) +static void power7_idle(void) { if (!powersave_nap) return; @@ -659,20 +659,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) mmcr0 = mfspr(SPRN_MMCR0); } - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - /* - * POWER10 uses MMCRA (BHRBRD) as BHRB disable bit. - * If the user hasn't asked for the BHRB to be - * written, the value of MMCRA[BHRBRD] is 1. - * On wakeup from stop, MMCRA[BHRBD] will be 0, - * since it is previleged resource and will be lost. - * Thus, if we do not save and restore the MMCRA[BHRBD], - * hardware will be needlessly writing to the BHRB - * in problem mode. - */ - mmcra = mfspr(SPRN_MMCRA); - } - if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) { sprs.lpcr = mfspr(SPRN_LPCR); sprs.hfscr = mfspr(SPRN_HFSCR); @@ -735,10 +721,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) mtspr(SPRN_MMCR0, mmcr0); } - /* Reload MMCRA to restore BHRB disable bit for POWER10 */ - if (cpu_has_feature(CPU_FTR_ARCH_31)) - mtspr(SPRN_MMCRA, mmcra); - /* * DD2.2 and earlier need to set then clear bit 60 in MMCRA * to ensure the PMU starts running. @@ -823,73 +805,6 @@ out: return srr1; } -#ifdef CONFIG_HOTPLUG_CPU -static unsigned long power9_offline_stop(unsigned long psscr) -{ - unsigned long srr1; - -#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE - __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); - __ppc64_runlatch_on(); -#else - /* - * Tell KVM we're entering idle. - * This does not have to be done in real mode because the P9 MMU - * is independent per-thread. Some steppings share radix/hash mode - * between threads, but in that case KVM has a barrier sync in real - * mode before and after switching between radix and hash. - * - * kvm_start_guest must still be called in real mode though, hence - * the false argument. - */ - local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; - - __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, false); - __ppc64_runlatch_on(); - - local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; - /* Order setting hwthread_state vs. testing hwthread_req */ - smp_mb(); - if (local_paca->kvm_hstate.hwthread_req) - srr1 = idle_kvm_start_guest(srr1); - mtmsr(MSR_KERNEL); -#endif - - return srr1; -} -#endif - -void power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) -{ - unsigned long psscr; - unsigned long srr1; - - if (!prep_irq_for_idle_irqsoff()) - return; - - psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; - - __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); - __ppc64_runlatch_on(); - - fini_irq_for_idle_irqsoff(); - - irq_set_pending_from_srr1(srr1); -} - -/* - * Used for ppc_md.power_save which needs a function with no parameters - */ -void power9_idle(void) -{ - power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); -} - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* * This is used in working around bugs in thread reconfiguration @@ -962,6 +877,198 @@ void pnv_power9_force_smt4_release(void) EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ +struct p10_sprs { + /* + * SPRs that get lost in shallow states: + * + * P10 loses CR, LR, CTR, FPSCR, VSCR, XER, TAR, SPRG2, and HSPRG1 + * isa300 idle routines restore CR, LR. + * CTR is volatile + * idle thread doesn't use FP or VEC + * kernel doesn't use TAR + * HSPRG1 is only live in HV interrupt entry + * SPRG2 is only live in KVM guests, KVM handles it. + */ +}; + +static unsigned long power10_idle_stop(unsigned long psscr, bool mmu_on) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; + unsigned long srr1; + unsigned long pls; +// struct p10_sprs sprs = {}; /* avoid false used-uninitialised */ + bool sprs_saved = false; + + if (!(psscr & (PSSCR_EC|PSSCR_ESL))) { + /* EC=ESL=0 case */ + + BUG_ON(!mmu_on); + + /* + * Wake synchronously. SRESET via xscom may still cause + * a 0x100 powersave wakeup with SRR1 reason! + */ + srr1 = isa300_idle_stop_noloss(psscr); /* go idle */ + if (likely(!srr1)) + return 0; + + /* + * Registers not saved, can't recover! + * This would be a hardware bug + */ + BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS); + + goto out; + } + + /* EC=ESL=1 case */ + if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) { + /* XXX: save SPRs for deep state loss here. */ + + sprs_saved = true; + + atomic_start_thread_idle(); + } + + srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ + + psscr = mfspr(SPRN_PSSCR); + + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); + + /* + * On POWER10, SRR1 bits do not match exactly as expected. + * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so + * just always test PSSCR for SPR/TB state loss. + */ + pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT; + if (likely(pls < deep_spr_loss_state)) { + if (sprs_saved) + atomic_stop_thread_idle(); + goto out; + } + + /* HV state loss */ + BUG_ON(!sprs_saved); + + atomic_lock_thread_idle(); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* XXX: restore per-core SPRs here */ + + if (pls >= pnv_first_tb_loss_level) { + /* TB loss */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + } + + /* + * isync after restoring shared SPRs and before unlocking. Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. + */ + isync(); + +core_woken: + atomic_unlock_and_stop_thread_idle(); + + /* XXX: restore per-thread SPRs here */ + + if (!radix_enabled()) + __slb_restore_bolted_realmode(); + +out: + if (mmu_on) + mtmsr(MSR_KERNEL); + + return srr1; +} + +#ifdef CONFIG_HOTPLUG_CPU +static unsigned long arch300_offline_stop(unsigned long psscr) +{ + unsigned long srr1; + +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE + __ppc64_runlatch_off(); + if (cpu_has_feature(CPU_FTR_ARCH_31)) + srr1 = power10_idle_stop(psscr, true); + else + srr1 = power9_idle_stop(psscr, true); + __ppc64_runlatch_on(); +#else + /* + * Tell KVM we're entering idle. + * This does not have to be done in real mode because the P9 MMU + * is independent per-thread. Some steppings share radix/hash mode + * between threads, but in that case KVM has a barrier sync in real + * mode before and after switching between radix and hash. + * + * kvm_start_guest must still be called in real mode though, hence + * the false argument. + */ + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; + + __ppc64_runlatch_off(); + if (cpu_has_feature(CPU_FTR_ARCH_31)) + srr1 = power10_idle_stop(psscr, false); + else + srr1 = power9_idle_stop(psscr, false); + __ppc64_runlatch_on(); + + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; + /* Order setting hwthread_state vs. testing hwthread_req */ + smp_mb(); + if (local_paca->kvm_hstate.hwthread_req) + srr1 = idle_kvm_start_guest(srr1); + mtmsr(MSR_KERNEL); +#endif + + return srr1; +} +#endif + +void arch300_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long psscr; + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + __ppc64_runlatch_off(); + if (cpu_has_feature(CPU_FTR_ARCH_31)) + srr1 = power10_idle_stop(psscr, true); + else + srr1 = power9_idle_stop(psscr, true); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + irq_set_pending_from_srr1(srr1); +} + +/* + * Used for ppc_md.power_save which needs a function with no parameters + */ +static void arch300_idle(void) +{ + arch300_idle_type(pnv_default_stop_val, pnv_default_stop_mask); +} + #ifdef CONFIG_HOTPLUG_CPU void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) @@ -995,7 +1102,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) psscr = mfspr(SPRN_PSSCR); psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | pnv_deepest_stop_psscr_val; - srr1 = power9_offline_stop(psscr); + srr1 = arch300_offline_stop(psscr); } else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) { srr1 = power7_offline(); } else { @@ -1093,11 +1200,15 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) * @dt_idle_states: Number of idle state entries * Returns 0 on success */ -static void __init pnv_power9_idle_init(void) +static void __init pnv_arch300_idle_init(void) { u64 max_residency_ns = 0; int i; + /* stop is not really architected, we only have p9,p10 drivers */ + if (!pvr_version_is(PVR_POWER10) && !pvr_version_is(PVR_POWER9)) + return; + /* * pnv_deepest_stop_{val,mask} should be set to values corresponding to * the deepest stop state. @@ -1112,6 +1223,11 @@ static void __init pnv_power9_idle_init(void) struct pnv_idle_states_t *state = &pnv_idle_states[i]; u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK; + /* No deep loss driver implemented for POWER10 yet */ + if (pvr_version_is(PVR_POWER10) && + state->flags & (OPAL_PM_TIMEBASE_STOP|OPAL_PM_LOSE_FULL_CONTEXT)) + continue; + if ((state->flags & OPAL_PM_TIMEBASE_STOP) && (pnv_first_tb_loss_level > psscr_rl)) pnv_first_tb_loss_level = psscr_rl; @@ -1162,7 +1278,7 @@ static void __init pnv_power9_idle_init(void) if (unlikely(!default_stop_found)) { pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n"); } else { - ppc_md.power_save = power9_idle; + ppc_md.power_save = arch300_idle; pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_default_stop_val, pnv_default_stop_mask); } @@ -1224,7 +1340,7 @@ static void __init pnv_probe_idle_states(void) } if (cpu_has_feature(CPU_FTR_ARCH_300)) - pnv_power9_idle_init(); + pnv_arch300_idle_init(); for (i = 0; i < nr_pnv_idle_states; i++) supported_cpuidle_states |= pnv_idle_states[i].flags; @@ -1295,7 +1411,7 @@ static int pnv_parse_cpuidle_dt(void) for (i = 0; i < nr_idle_states; i++) pnv_idle_states[i].residency_ns = temp_u32[i]; - /* For power9 */ + /* For power9 and later */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { /* Read pm_crtl_val */ if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr", @@ -1358,8 +1474,8 @@ static int __init pnv_init_idle_states(void) if (!cpu_has_feature(CPU_FTR_ARCH_300)) { /* P7/P8 nap */ p->thread_idle_state = PNV_THREAD_RUNNING; - } else { - /* P9 stop */ + } else if (pvr_version_is(PVR_POWER9)) { + /* P9 stop workarounds */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE p->requested_psscr = 0; atomic_set(&p->dont_stop, 0); diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 13b369d2cc45..6828108486f8 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -224,7 +224,7 @@ static int memtrace_online(void) ent->mem = 0; } - if (add_memory(ent->nid, ent->start, ent->size)) { + if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) { pr_err("Failed to add trace memory to node %d\n", ent->nid); ret += 1; diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c index 8c65aacda9c8..ecdad219d704 100644 --- a/arch/powerpc/platforms/powernv/ocxl.c +++ b/arch/powerpc/platforms/powernv/ocxl.c @@ -2,7 +2,6 @@ // Copyright 2017 IBM Corp. #include <asm/pnv-ocxl.h> #include <asm/opal.h> -#include <asm/xive.h> #include <misc/ocxl-config.h> #include "pci.h" @@ -484,32 +483,3 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle) return rc; } EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache); - -int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr) -{ - __be64 flags, trigger_page; - s64 rc; - u32 hwirq; - - hwirq = xive_native_alloc_irq(); - if (!hwirq) - return -ENOENT; - - rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page, NULL, - NULL); - if (rc || !trigger_page) { - xive_native_free_irq(hwirq); - return -ENOENT; - } - *irq = hwirq; - *trigger_addr = be64_to_cpu(trigger_page); - return 0; - -} -EXPORT_SYMBOL_GPL(pnv_ocxl_alloc_xive_irq); - -void pnv_ocxl_free_xive_irq(u32 irq) -{ - xive_native_free_irq(irq); -} -EXPORT_SYMBOL_GPL(pnv_ocxl_free_xive_irq); diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 6dba3b62269f..23571f0b555a 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -510,7 +510,7 @@ static void __init opalcore_config_init(void) idx = be32_to_cpu(opalc_metadata->region_cnt); if (idx > MAX_PT_LOAD_CNT) { pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)", - MAX_PT_LOAD_CNT, idx); + idx, MAX_PT_LOAD_CNT); idx = MAX_PT_LOAD_CNT; } for (i = 0; i < idx; i++) { diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c index 62ef7ad995da..5e33b1fc67c2 100644 --- a/arch/powerpc/platforms/powernv/opal-elog.c +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -179,14 +179,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, return count; } -static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) +static void create_elog_obj(uint64_t id, size_t size, uint64_t type) { struct elog_obj *elog; int rc; elog = kzalloc(sizeof(*elog), GFP_KERNEL); if (!elog) - return NULL; + return; elog->kobj.kset = elog_kset; @@ -219,18 +219,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) rc = kobject_add(&elog->kobj, NULL, "0x%llx", id); if (rc) { kobject_put(&elog->kobj); - return NULL; + return; } + /* + * As soon as the sysfs file for this elog is created/activated there is + * a chance the opal_errd daemon (or any userspace) might read and + * acknowledge the elog before kobject_uevent() is called. If that + * happens then there is a potential race between + * elog_ack_store->kobject_put() and kobject_uevent() which leads to a + * use-after-free of a kernfs object resulting in a kernel crash. + * + * To avoid that, we need to take a reference on behalf of the bin file, + * so that our reference remains valid while we call kobject_uevent(). + * We then drop our reference before exiting the function, leaving the + * bin file to drop the last reference (if it hasn't already). + */ + + /* Take a reference for the bin file */ + kobject_get(&elog->kobj); rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr); - if (rc) { + if (rc == 0) { + kobject_uevent(&elog->kobj, KOBJ_ADD); + } else { + /* Drop the reference taken for the bin file */ kobject_put(&elog->kobj); - return NULL; } - kobject_uevent(&elog->kobj, KOBJ_ADD); + /* Drop our reference */ + kobject_put(&elog->kobj); - return elog; + return; } static irqreturn_t elog_event(int irq, void *data) diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index d26da19a611f..d3b6e135c18b 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -12,6 +12,8 @@ #include <linux/types.h> #include <asm/barrier.h> +#include "powernv.h" + /* OPAL in-memory console. Defined in OPAL source at core/console.c */ struct memcons { __be64 magic; diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 45f4223a790f..deddaebf8c14 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -24,7 +24,7 @@ #include <linux/uaccess.h> -/** +/* * The msg member must be at the end of the struct, as it's followed by the * message data. */ diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 023a4f987bb2..2b4ceb5e6ce4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -894,7 +894,6 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; long rc, rid_end, rid; @@ -904,7 +903,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; - parent = pe->pbus->self; if (pe->flags & PNV_IODA_PE_BUS_ALL) count = resource_size(&pe->pbus->busn_res); else @@ -925,12 +923,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) } rid_end = pe->rid + (count << 8); } else { -#ifdef CONFIG_PCI_IOV - if (pe->flags & PNV_IODA_PE_VF) - parent = pe->parent_dev; - else -#endif /* CONFIG_PCI_IOV */ - parent = pe->pdev->bus->self; bcomp = OpalPciBusAll; dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 1aa51c4fa904..11df4e16a1cc 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -2,6 +2,13 @@ #ifndef _POWERNV_H #define _POWERNV_H +/* + * There's various hacks scattered throughout the generic powerpc arch code + * that needs to call into powernv platform stuff. The prototypes for those + * functions are in asm/powernv.h + */ +#include <asm/powernv.h> + #ifdef CONFIG_SMP extern void pnv_smp_init(void); #else diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 8035caf6e297..72c25295c1c2 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -65,7 +65,7 @@ int powernv_get_random_real_mode(unsigned long *v) return 1; } -int powernv_get_random_darn(unsigned long *v) +static int powernv_get_random_darn(unsigned long *v) { unsigned long val; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 7fcb88623081..9acaa0f131b9 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -130,6 +130,28 @@ static void pnv_setup_rfi_flush(void) setup_count_cache_flush(); } +static void __init pnv_check_guarded_cores(void) +{ + struct device_node *dn; + int bad_count = 0; + + for_each_node_by_type(dn, "cpu") { + if (of_property_match_string(dn, "status", "bad") >= 0) + bad_count++; + }; + + if (bad_count) { + printk(" _ _______________\n"); + pr_cont(" | | / \\\n"); + pr_cont(" | | | WARNING! |\n"); + pr_cont(" | | | |\n"); + pr_cont(" | | | It looks like |\n"); + pr_cont(" |_| | you have %*d |\n", 3, bad_count); + pr_cont(" _ | guarded cores |\n"); + pr_cont(" (_) \\_______________/\n"); + } +} + static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -150,6 +172,8 @@ static void __init pnv_setup_arch(void) /* Enable NAP mode */ powersave_nap = 1; + pnv_check_guarded_cores(); + /* XXX PMCS */ } diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index b2ba3e95bda7..54c4ba45c7ce 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -43,7 +43,7 @@ #include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) #else -#define DBG(fmt...) +#define DBG(fmt...) do { } while (0) #endif static void pnv_smp_setup_cpu(int cpu) @@ -158,7 +158,7 @@ static void pnv_flush_interrupts(void) } } -static void pnv_smp_cpu_kill_self(void) +static void pnv_cpu_offline_self(void) { unsigned long srr1, unexpected_mask, wmask; unsigned int cpu; @@ -417,6 +417,7 @@ static struct smp_ops_t pnv_smp_ops = { #ifdef CONFIG_HOTPLUG_CPU .cpu_disable = pnv_smp_cpu_disable, .cpu_die = generic_cpu_die, + .cpu_offline_self = pnv_cpu_offline_self, #endif /* CONFIG_HOTPLUG_CPU */ }; @@ -430,7 +431,6 @@ void __init pnv_smp_init(void) smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU - ppc_md.cpu_die = pnv_smp_cpu_kill_self; #ifdef CONFIG_KEXEC_CORE crash_wake_offline = 1; #endif diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 6434f9cb5aed..5f5fe63a3d1c 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -186,7 +186,7 @@ static void unmap_winctx_mmio_bars(struct vas_window *window) * OS/User Window Context (UWC) MMIO Base Address Region for the given window. * Map these bus addresses and save the mapped kernel addresses in @window. */ -int map_winctx_mmio_bars(struct vas_window *window) +static int map_winctx_mmio_bars(struct vas_window *window) { int len; u64 start; @@ -214,7 +214,7 @@ int map_winctx_mmio_bars(struct vas_window *window) * registers are not sequential. And, we can only write to offsets * with valid registers. */ -void reset_window_regs(struct vas_window *window) +static void reset_window_regs(struct vas_window *window) { write_hvwc_reg(window, VREG(LPID), 0ULL); write_hvwc_reg(window, VREG(PID), 0ULL); @@ -357,7 +357,8 @@ static void init_rsvd_tx_buf_count(struct vas_window *txwin, * as a one-time task? That could work for NX but what about other * receivers? Let the receivers tell us the rx-fifo buffers for now. */ -int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) +static void init_winctx_regs(struct vas_window *window, + struct vas_winctx *winctx) { u64 val; int fifo_size; @@ -499,8 +500,6 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win); val = SET_FIELD(VAS_WINCTL_OPEN, val, 1); write_hvwc_reg(window, VREG(WINCTL), val); - - return 0; } static void vas_release_window_id(struct ida *ida, int winid) diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index 1193c294b8d0..0c252478e556 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -448,7 +448,7 @@ static void ps3_disable_spu(struct spu_context *ctx) ctx->ops->runcntl_stop(ctx); } -const struct spu_management_ops spu_management_ps3_ops = { +static const struct spu_management_ops spu_management_ps3_ops = { .enumerate_spus = ps3_enumerate_spus, .create_spu = ps3_create_spu, .destroy_spu = ps3_destroy_spu, @@ -589,7 +589,7 @@ static u64 resource_allocation_enable_get(struct spu *spu) return 0; /* No support. */ } -const struct spu_priv1_ops spu_priv1_ps3_ops = { +static const struct spu_priv1_ops spu_priv1_ps3_ops = { .int_mask_and = int_mask_and, .int_mask_or = int_mask_or, .int_mask_set = int_mask_set, diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 3542b7bd6a46..c62aaa29a9d5 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -9,7 +9,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/export.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/err.h> #include <linux/slab.h> @@ -696,6 +696,8 @@ static const struct dma_map_ops ps3_sb_dma_ops = { .unmap_page = ps3_unmap_page, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; static const struct dma_map_ops ps3_ioc0_dma_ops = { @@ -708,6 +710,8 @@ static const struct dma_map_ops ps3_ioc0_dma_ops = { .unmap_page = ps3_unmap_page, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; /** diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index cb2d9a970b7b..cf024fa37bda 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -33,8 +33,6 @@ #include <asm/ppc-pci.h> #include <asm/rtas.h> -static int pseries_eeh_get_pe_addr(struct pci_dn *pdn); - /* RTAS tokens */ static int ibm_set_eeh_option; static int ibm_set_slot_reset; @@ -86,42 +84,43 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) /** - * pseries_eeh_get_config_addr - Retrieve config address + * pseries_eeh_get_pe_config_addr - Find the pe_config_addr for a device + * @pdn: pci_dn of the input device + * + * The EEH RTAS calls use a tuple consisting of: (buid_hi, buid_lo, + * pe_config_addr) as a handle to a given PE. This function finds the + * pe_config_addr based on the device's config addr. * - * Retrieve the assocated config address. Actually, there're 2 RTAS - * function calls dedicated for the purpose. We need implement - * it through the new function and then the old one. Besides, - * you should make sure the config address is figured out from - * FDT node before calling the function. + * Keep in mind that the pe_config_addr *might* be numerically identical to the + * device's config addr, but the two are conceptually distinct. * - * It's notable that zero'ed return value means invalid PE config - * address. + * Returns the pe_config_addr, or a negative error code. */ -static int pseries_eeh_get_config_addr(struct pci_controller *phb, int config_addr) +static int pseries_eeh_get_pe_config_addr(struct pci_dn *pdn) { - int ret = 0; - int rets[3]; + int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + struct pci_controller *phb = pdn->phb; + int ret, rets[3]; if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { /* - * First of all, we need to make sure there has one PE - * associated with the device. Otherwise, PE address is - * meaningless. + * First of all, use function 1 to determine if this device is + * part of a PE or not. ret[0] being zero indicates it's not. */ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, config_addr, BUID_HI(phb->buid), BUID_LO(phb->buid), 1); if (ret || (rets[0] == 0)) - return 0; + return -ENOENT; - /* Retrieve the associated PE config address */ + /* Retrieve the associated PE config address with function 0 */ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, config_addr, BUID_HI(phb->buid), BUID_LO(phb->buid), 0); if (ret) { pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", __func__, phb->global_number, config_addr); - return 0; + return -ENXIO; } return rets[0]; @@ -134,13 +133,20 @@ static int pseries_eeh_get_config_addr(struct pci_controller *phb, int config_ad if (ret) { pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", __func__, phb->global_number, config_addr); - return 0; + return -ENXIO; } return rets[0]; } - return ret; + /* + * PAPR does describe a process for finding the pe_config_addr that was + * used before the ibm,get-config-addr-info calls were added. However, + * I haven't found *any* systems that don't have that RTAS call + * implemented. If you happen to find one that needs the old DT based + * process, patches are welcome! + */ + return -ENOENT; } /** @@ -161,8 +167,7 @@ static int pseries_eeh_phb_reset(struct pci_controller *phb, int config_addr, in BUID_LO(phb->buid), option); /* If fundamental-reset not supported, try hot-reset */ - if (option == EEH_RESET_FUNDAMENTAL && - ret == -8) { + if (option == EEH_RESET_FUNDAMENTAL && ret == -8) { option = EEH_RESET_HOT; ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, config_addr, BUID_HI(phb->buid), @@ -170,8 +175,7 @@ static int pseries_eeh_phb_reset(struct pci_controller *phb, int config_addr, in } /* We need reset hold or settlement delay */ - if (option == EEH_RESET_FUNDAMENTAL || - option == EEH_RESET_HOT) + if (option == EEH_RESET_FUNDAMENTAL || option == EEH_RESET_HOT) msleep(EEH_PE_RST_HOLD_TIME); else msleep(EEH_PE_RST_SETTLE_TIME); @@ -239,88 +243,6 @@ static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; static DEFINE_SPINLOCK(slot_errbuf_lock); static int eeh_error_buf_size; -/** - * pseries_eeh_init - EEH platform dependent initialization - * - * EEH platform dependent initialization on pseries. - */ -static int pseries_eeh_init(void) -{ - struct pci_controller *phb; - struct pci_dn *pdn; - int addr, config_addr; - - /* figure out EEH RTAS function call tokens */ - ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); - ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); - ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); - ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); - ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); - ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); - ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); - ibm_configure_pe = rtas_token("ibm,configure-pe"); - - /* - * ibm,configure-pe and ibm,configure-bridge have the same semantics, - * however ibm,configure-pe can be faster. If we can't find - * ibm,configure-pe then fall back to using ibm,configure-bridge. - */ - if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE) - ibm_configure_pe = rtas_token("ibm,configure-bridge"); - - /* - * Necessary sanity check. We needn't check "get-config-addr-info" - * and its variant since the old firmware probably support address - * of domain/bus/slot/function for EEH RTAS operations. - */ - if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE || - ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE || - (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && - ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) || - ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE || - ibm_configure_pe == RTAS_UNKNOWN_SERVICE) { - pr_info("EEH functionality not supported\n"); - return -EINVAL; - } - - /* Initialize error log lock and size */ - spin_lock_init(&slot_errbuf_lock); - eeh_error_buf_size = rtas_token("rtas-error-log-max"); - if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { - pr_info("%s: unknown EEH error log size\n", - __func__); - eeh_error_buf_size = 1024; - } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { - pr_info("%s: EEH error log size %d exceeds the maximal %d\n", - __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX); - eeh_error_buf_size = RTAS_ERROR_LOG_MAX; - } - - /* Set EEH probe mode */ - eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); - - /* Set EEH machine dependent code */ - ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; - - if (is_kdump_kernel() || reset_devices) { - pr_info("Issue PHB reset ...\n"); - list_for_each_entry(phb, &hose_list, list_node) { - pdn = list_first_entry(&PCI_DN(phb->dn)->child_list, struct pci_dn, list); - addr = (pdn->busno << 16) | (pdn->devfn << 8); - config_addr = pseries_eeh_get_config_addr(phb, addr); - /* invalid PE config addr */ - if (config_addr == 0) - continue; - - pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_FUNDAMENTAL); - pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_DEACTIVATE); - pseries_eeh_phb_configure_bridge(phb, config_addr); - } - } - - return 0; -} - static int pseries_eeh_cap_start(struct pci_dn *pdn) { u32 status; @@ -439,10 +361,9 @@ static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev) */ void pseries_eeh_init_edev(struct pci_dn *pdn) { + struct eeh_pe pe, *parent; struct eeh_dev *edev; - struct eeh_pe pe; u32 pcie_flags; - int enable = 0; int ret; if (WARN_ON_ONCE(!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))) @@ -499,51 +420,38 @@ void pseries_eeh_init_edev(struct pci_dn *pdn) } } - /* Initialize the fake PE */ + /* first up, find the pe_config_addr for the PE containing the device */ + ret = pseries_eeh_get_pe_config_addr(pdn); + if (ret < 0) { + eeh_edev_dbg(edev, "Unable to find pe_config_addr\n"); + goto err; + } + + /* Try enable EEH on the fake PE */ memset(&pe, 0, sizeof(struct eeh_pe)); pe.phb = pdn->phb; - pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + pe.addr = ret; - /* Enable EEH on the device */ eeh_edev_dbg(edev, "Enabling EEH on device\n"); ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); if (ret) { eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret); - } else { - struct eeh_pe *parent; + goto err; + } - /* Retrieve PE address */ - edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn); - pe.addr = edev->pe_config_addr; + edev->pe_config_addr = pe.addr; - /* Some older systems (Power4) allow the ibm,set-eeh-option - * call to succeed even on nodes where EEH is not supported. - * Verify support explicitly. - */ - ret = eeh_ops->get_state(&pe, NULL); - if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) - enable = 1; + eeh_add_flag(EEH_ENABLED); - /* - * This device doesn't support EEH, but it may have an - * EEH parent. In this case any error on the device will - * freeze the PE of it's upstream bridge, so added it to - * the upstream PE. - */ - parent = pseries_eeh_pe_get_parent(edev); - if (parent && !enable) - edev->pe_config_addr = parent->addr; + parent = pseries_eeh_pe_get_parent(edev); + eeh_pe_tree_insert(edev, parent); + eeh_save_bars(edev); + eeh_edev_dbg(edev, "EEH enabled for device"); - if (enable || parent) { - eeh_add_flag(EEH_ENABLED); - eeh_pe_tree_insert(edev, parent); - } - eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n", - (enable ? "enabled" : "unsupported"), ret); - } + return; - /* Save memory bars */ - eeh_save_bars(edev); +err: + eeh_edev_dbg(edev, "EEH is unsupported on device (code = %d)\n", ret); } static struct eeh_dev *pseries_eeh_probe(struct pci_dev *pdev) @@ -600,7 +508,6 @@ EXPORT_SYMBOL_GPL(pseries_eeh_init_edev_recursive); static int pseries_eeh_set_option(struct eeh_pe *pe, int option) { int ret = 0; - int config_addr; /* * When we're enabling or disabling EEH functioality on @@ -613,85 +520,23 @@ static int pseries_eeh_set_option(struct eeh_pe *pe, int option) case EEH_OPT_ENABLE: case EEH_OPT_THAW_MMIO: case EEH_OPT_THAW_DMA: - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; break; case EEH_OPT_FREEZE_PE: /* Not support */ return 0; default: - pr_err("%s: Invalid option %d\n", - __func__, option); + pr_err("%s: Invalid option %d\n", __func__, option); return -EINVAL; } ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), + pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid), option); return ret; } /** - * pseries_eeh_get_pe_addr - Retrieve PE address - * @pe: EEH PE - * - * Retrieve the assocated PE address. Actually, there're 2 RTAS - * function calls dedicated for the purpose. We need implement - * it through the new function and then the old one. Besides, - * you should make sure the config address is figured out from - * FDT node before calling the function. - * - * It's notable that zero'ed return value means invalid PE config - * address. - */ -static int pseries_eeh_get_pe_addr(struct pci_dn *pdn) -{ - int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); - unsigned long buid = pdn->phb->buid; - int ret = 0; - int rets[3]; - - if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { - /* - * First of all, we need to make sure there has one PE - * associated with the device. Otherwise, PE address is - * meaningless. - */ - ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - config_addr, BUID_HI(buid), BUID_LO(buid), 1); - if (ret || (rets[0] == 0)) - return 0; - - /* Retrieve the associated PE config address */ - ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - config_addr, BUID_HI(buid), BUID_LO(buid), 0); - if (ret) { - pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", - __func__, pdn->phb->global_number, config_addr); - return 0; - } - - return rets[0]; - } - - if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { - ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, - config_addr, BUID_HI(buid), BUID_LO(buid), 0); - if (ret) { - pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", - __func__, pdn->phb->global_number, config_addr); - return 0; - } - - return rets[0]; - } - - return ret; -} - -/** * pseries_eeh_get_state - Retrieve PE state * @pe: EEH PE * @delay: suggested time to wait if state is unavailable @@ -706,25 +551,19 @@ static int pseries_eeh_get_pe_addr(struct pci_dn *pdn) */ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) { - int config_addr; int ret; int rets[4]; int result; - /* Figure out PE config address if possible */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets, - config_addr, BUID_HI(pe->phb->buid), + pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid)); } else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) { /* Fake PE unavailable info */ rets[2] = 0; ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets, - config_addr, BUID_HI(pe->phb->buid), + pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid)); } else { return EEH_STATE_NOT_SUPPORT; @@ -778,14 +617,7 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) */ static int pseries_eeh_reset(struct eeh_pe *pe, int option) { - int config_addr; - - /* Figure out PE address */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - - return pseries_eeh_phb_reset(pe->phb, config_addr, option); + return pseries_eeh_phb_reset(pe->phb, pe->addr, option); } /** @@ -801,19 +633,13 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option) */ static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len) { - int config_addr; unsigned long flags; int ret; spin_lock_irqsave(&slot_errbuf_lock, flags); memset(slot_errbuf, 0, eeh_error_buf_size); - /* Figure out the PE address */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - - ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr, + ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, pe->addr, BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid), virt_to_phys(drv_log), len, virt_to_phys(slot_errbuf), eeh_error_buf_size, @@ -832,14 +658,7 @@ static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, u */ static int pseries_eeh_configure_bridge(struct eeh_pe *pe) { - int config_addr; - - /* Figure out the PE address */ - config_addr = pe->config_addr; - if (pe->addr) - config_addr = pe->addr; - - return pseries_eeh_phb_configure_bridge(pe->phb, config_addr); + return pseries_eeh_phb_configure_bridge(pe->phb, pe->addr); } /** @@ -954,8 +773,7 @@ static int pseries_notify_resume(struct eeh_dev *edev) if (!edev) return -EEXIST; - if (rtas_token("ibm,open-sriov-allow-unfreeze") - == RTAS_UNKNOWN_SERVICE) + if (rtas_token("ibm,open-sriov-allow-unfreeze") == RTAS_UNKNOWN_SERVICE) return -EINVAL; if (edev->pdev->is_physfn || edev->pdev->is_virtfn) @@ -967,7 +785,6 @@ static int pseries_notify_resume(struct eeh_dev *edev) static struct eeh_ops pseries_eeh_ops = { .name = "pseries", - .init = pseries_eeh_init, .probe = pseries_eeh_probe, .set_option = pseries_eeh_set_option, .get_state = pseries_eeh_get_state, @@ -992,15 +809,84 @@ static struct eeh_ops pseries_eeh_ops = { */ static int __init eeh_pseries_init(void) { - int ret; + struct pci_controller *phb; + struct pci_dn *pdn; + int ret, config_addr; - ret = eeh_ops_register(&pseries_eeh_ops); + /* figure out EEH RTAS function call tokens */ + ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); + ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); + ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); + ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); + + /* + * ibm,configure-pe and ibm,configure-bridge have the same semantics, + * however ibm,configure-pe can be faster. If we can't find + * ibm,configure-pe then fall back to using ibm,configure-bridge. + */ + if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE) + ibm_configure_pe = rtas_token("ibm,configure-bridge"); + + /* + * Necessary sanity check. We needn't check "get-config-addr-info" + * and its variant since the old firmware probably support address + * of domain/bus/slot/function for EEH RTAS operations. + */ + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE || + ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE || + (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && + ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) || + ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE || + ibm_configure_pe == RTAS_UNKNOWN_SERVICE) { + pr_info("EEH functionality not supported\n"); + return -EINVAL; + } + + /* Initialize error log lock and size */ + spin_lock_init(&slot_errbuf_lock); + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + pr_info("%s: unknown EEH error log size\n", + __func__); + eeh_error_buf_size = 1024; + } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + pr_info("%s: EEH error log size %d exceeds the maximal %d\n", + __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + + /* Set EEH probe mode */ + eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); + + /* Set EEH machine dependent code */ + ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; + + if (is_kdump_kernel() || reset_devices) { + pr_info("Issue PHB reset ...\n"); + list_for_each_entry(phb, &hose_list, list_node) { + pdn = list_first_entry(&PCI_DN(phb->dn)->child_list, struct pci_dn, list); + config_addr = pseries_eeh_get_pe_config_addr(pdn); + + /* invalid PE config addr */ + if (config_addr < 0) + continue; + + pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_FUNDAMENTAL); + pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_DEACTIVATE); + pseries_eeh_phb_configure_bridge(phb, config_addr); + } + } + + ret = eeh_init(&pseries_eeh_ops); if (!ret) pr_info("EEH: pSeries platform initialized\n"); else pr_info("EEH: pSeries platform initialization failure (%d)\n", ret); - return ret; } -machine_early_initcall(pseries, eeh_pseries_init); +machine_arch_initcall(pseries, eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 7a974ed6b240..f2837e33bf5d 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -55,7 +55,7 @@ static void rtas_stop_self(void) panic("Alas, I survived.\n"); } -static void pseries_mach_cpu_die(void) +static void pseries_cpu_offline_self(void) { unsigned int hwcpu = hard_smp_processor_id(); @@ -102,7 +102,7 @@ static int pseries_cpu_disable(void) * to self-destroy so that the cpu-offline thread can send the CPU_DEAD * notifications. * - * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to + * OTOH, pseries_cpu_offline_self() is called by the @cpu when it wants to * self-destruct. */ static void pseries_cpu_die(unsigned int cpu) @@ -901,7 +901,7 @@ static int __init pseries_cpu_hotplug_init(void) return 0; } - ppc_md.cpu_die = pseries_mach_cpu_die; + smp_ops->cpu_offline_self = pseries_cpu_offline_self; smp_ops->cpu_disable = pseries_cpu_disable; smp_ops->cpu_die = pseries_cpu_die; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 5d545b78111f..7efe6ec5d14a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -30,12 +30,17 @@ unsigned long pseries_memory_block_size(void) np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (np) { - const __be64 *size; + int len; + int size_cells; + const __be32 *prop; - size = of_get_property(np, "ibm,lmb-size", NULL); - if (size) - memblock_size = be64_to_cpup(size); + size_cells = of_n_size_cells(np); + + prop = of_get_property(np, "ibm,lmb-size", &len); + if (prop && len >= size_cells * sizeof(__be32)) + memblock_size = of_read_number(prop, size_cells); of_node_put(np); + } else if (machine_is(pseries)) { /* This fallback really only applies to pseries */ unsigned int memzero_size = 0; @@ -277,7 +282,7 @@ static int dlpar_offline_lmb(struct drmem_lmb *lmb) return dlpar_change_lmb_state(lmb, false); } -static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) +static int pseries_remove_memblock(unsigned long base, unsigned long memblock_size) { unsigned long block_sz, start_pfn; int sections_per_block; @@ -308,10 +313,11 @@ out: static int pseries_remove_mem_node(struct device_node *np) { - const __be32 *regs; + const __be32 *prop; unsigned long base; - unsigned int lmb_size; + unsigned long lmb_size; int ret = -EINVAL; + int addr_cells, size_cells; /* * Check to see if we are actually removing memory @@ -322,12 +328,19 @@ static int pseries_remove_mem_node(struct device_node *np) /* * Find the base address and size of the memblock */ - regs = of_get_property(np, "reg", NULL); - if (!regs) + prop = of_get_property(np, "reg", NULL); + if (!prop) return ret; - base = be64_to_cpu(*(unsigned long *)regs); - lmb_size = be32_to_cpu(regs[3]); + addr_cells = of_n_addr_cells(np); + size_cells = of_n_size_cells(np); + + /* + * "reg" property represents (addr,size) tuple. + */ + base = of_read_number(prop, addr_cells); + prop += addr_cells; + lmb_size = of_read_number(prop, size_cells); pseries_remove_memblock(base, lmb_size); return 0; @@ -354,25 +367,32 @@ static int dlpar_add_lmb(struct drmem_lmb *); static int dlpar_remove_lmb(struct drmem_lmb *lmb) { + struct memory_block *mem_block; unsigned long block_sz; int rc; if (!lmb_is_removable(lmb)) return -EINVAL; + mem_block = lmb_to_memblock(lmb); + if (mem_block == NULL) + return -EINVAL; + rc = dlpar_offline_lmb(lmb); - if (rc) + if (rc) { + put_device(&mem_block->dev); return rc; + } block_sz = pseries_memory_block_size(); - __remove_memory(lmb->nid, lmb->base_addr, block_sz); + __remove_memory(mem_block->nid, lmb->base_addr, block_sz); + put_device(&mem_block->dev); /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); - lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; @@ -557,7 +577,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) #else static inline int pseries_remove_memblock(unsigned long base, - unsigned int memblock_size) + unsigned long memblock_size) { return -EOPNOTSUPP; } @@ -591,7 +611,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) static int dlpar_add_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int rc; + int nid, rc; if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; @@ -602,11 +622,15 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) return rc; } - lmb_set_nid(lmb); block_sz = memory_block_size_bytes(); + /* Find the node id for this LMB. Fake one if necessary. */ + nid = of_drconf_to_nid_single(lmb); + if (nid < 0 || !node_possible(nid)) + nid = first_online_node; + /* Add the memory */ - rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); + rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; @@ -614,9 +638,8 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(lmb->nid, lmb->base_addr, block_sz); + __remove_memory(nid, lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); - lmb_clear_nid(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; } @@ -878,10 +901,11 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) static int pseries_add_mem_node(struct device_node *np) { - const __be32 *regs; + const __be32 *prop; unsigned long base; - unsigned int lmb_size; + unsigned long lmb_size; int ret = -EINVAL; + int addr_cells, size_cells; /* * Check to see if we are actually adding memory @@ -892,12 +916,18 @@ static int pseries_add_mem_node(struct device_node *np) /* * Find the base and size of the memblock */ - regs = of_get_property(np, "reg", NULL); - if (!regs) + prop = of_get_property(np, "reg", NULL); + if (!prop) return ret; - base = be64_to_cpu(*(unsigned long *)regs); - lmb_size = be32_to_cpu(regs[3]); + addr_cells = of_n_addr_cells(np); + size_cells = of_n_size_cells(np); + /* + * "reg" property represents (addr,size) tuple. + */ + base = of_read_number(prop, addr_cells); + prop += addr_cells; + lmb_size = of_read_number(prop, size_cells); /* * Update memory region to represent the memory add diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index c40c62ec432e..2c59b4986ea5 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -70,31 +70,14 @@ static int hc_show(struct seq_file *m, void *p) return 0; } -static const struct seq_operations hcall_inst_seq_ops = { +static const struct seq_operations hcall_inst_sops = { .start = hc_start, .next = hc_next, .stop = hc_stop, .show = hc_show }; -static int hcall_inst_seq_open(struct inode *inode, struct file *file) -{ - int rc; - struct seq_file *seq; - - rc = seq_open(file, &hcall_inst_seq_ops); - seq = file->private_data; - seq->private = file_inode(file)->i_private; - - return rc; -} - -static const struct file_operations hcall_inst_seq_fops = { - .open = hcall_inst_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(hcall_inst); #define HCALL_ROOT_DIR "hcall_inst" #define CPU_NAME_BUF_SIZE 32 @@ -149,7 +132,7 @@ static int __init hcall_inst_init(void) snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu); debugfs_create_file(cpu_name_buf, 0444, hcall_root, per_cpu(hcall_stats, cpu), - &hcall_inst_seq_fops); + &hcall_inst_fops); } return 0; diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index a6f101c958e8..8c6e509f6967 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -40,7 +40,7 @@ #include <linux/export.h> #include <linux/console.h> #include <linux/kobject.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/interrupt.h> #include <linux/of.h> #include <linux/slab.h> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 6d47b4a3ce39..e4198700ed1a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -39,6 +39,20 @@ #include "pseries.h" +enum { + DDW_QUERY_PE_DMA_WIN = 0, + DDW_CREATE_PE_DMA_WIN = 1, + DDW_REMOVE_PE_DMA_WIN = 2, + + DDW_APPLICABLE_SIZE +}; + +enum { + DDW_EXT_SIZE = 0, + DDW_EXT_RESET_DMA_WIN = 1, + DDW_EXT_QUERY_OUT_SIZE = 2 +}; + static struct iommu_table_group *iommu_pseries_alloc_group(int node) { struct iommu_table_group *table_group; @@ -334,7 +348,7 @@ struct direct_window { /* Dynamic DMA Window support */ struct ddw_query_response { u32 windows_available; - u32 largest_available_block; + u64 largest_available_block; u32 page_size; u32 migration_capable; }; @@ -767,25 +781,14 @@ static int __init disable_ddw_setup(char *str) early_param("disable_ddw", disable_ddw_setup); -static void remove_ddw(struct device_node *np, bool remove_prop) +static void remove_dma_window(struct device_node *np, u32 *ddw_avail, + struct property *win) { struct dynamic_dma_window_prop *dwp; - struct property *win64; - u32 ddw_avail[3]; u64 liobn; - int ret = 0; - - ret = of_property_read_u32_array(np, "ibm,ddw-applicable", - &ddw_avail[0], 3); - - win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); - if (!win64) - return; - - if (ret || win64->length < sizeof(*dwp)) - goto delprop; + int ret; - dwp = win64->value; + dwp = win->value; liobn = (u64)be32_to_cpu(dwp->liobn); /* clear the whole window, note the arg is in kernel pages */ @@ -798,19 +801,39 @@ static void remove_ddw(struct device_node *np, bool remove_prop) pr_debug("%pOF successfully cleared tces in window.\n", np); - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn); if (ret) pr_warn("%pOF: failed to remove direct window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", - np, ret, ddw_avail[2], liobn); + np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); else pr_debug("%pOF: successfully removed direct window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", - np, ret, ddw_avail[2], liobn); + np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); +} + +static void remove_ddw(struct device_node *np, bool remove_prop) +{ + struct property *win; + u32 ddw_avail[DDW_APPLICABLE_SIZE]; + int ret = 0; -delprop: - if (remove_prop) - ret = of_remove_property(np, win64); + ret = of_property_read_u32_array(np, "ibm,ddw-applicable", + &ddw_avail[0], DDW_APPLICABLE_SIZE); + if (ret) + return; + + win = of_find_property(np, DIRECT64_PROPNAME, NULL); + if (!win) + return; + + if (win->length >= sizeof(struct dynamic_dma_window_prop)) + remove_dma_window(np, ddw_avail, win); + + if (!remove_prop) + return; + + ret = of_remove_property(np, win); if (ret) pr_warn("%pOF: failed to remove direct window property: %d\n", np, ret); @@ -869,14 +892,62 @@ static int find_existing_ddw_windows(void) } machine_arch_initcall(pseries, find_existing_ddw_windows); +/** + * ddw_read_ext - Get the value of an DDW extension + * @np: device node from which the extension value is to be read. + * @extnum: index number of the extension. + * @value: pointer to return value, modified when extension is available. + * + * Checks if "ibm,ddw-extensions" exists for this node, and get the value + * on index 'extnum'. + * It can be used only to check if a property exists, passing value == NULL. + * + * Returns: + * 0 if extension successfully read + * -EINVAL if the "ibm,ddw-extensions" does not exist, + * -ENODATA if "ibm,ddw-extensions" does not have a value, and + * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension. + */ +static inline int ddw_read_ext(const struct device_node *np, int extnum, + u32 *value) +{ + static const char propname[] = "ibm,ddw-extensions"; + u32 count; + int ret; + + ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count); + if (ret) + return ret; + + if (count < extnum) + return -EOVERFLOW; + + if (!value) + value = &count; + + return of_property_read_u32_index(np, propname, extnum, value); +} + static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, - struct ddw_query_response *query) + struct ddw_query_response *query, + struct device_node *parent) { struct device_node *dn; struct pci_dn *pdn; - u32 cfg_addr; + u32 cfg_addr, ext_query, query_out[5]; u64 buid; - int ret; + int ret, out_sz; + + /* + * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many + * output parameters ibm,query-pe-dma-windows will have, ranging from + * 5 to 6. + */ + ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query); + if (!ret && ext_query == 1) + out_sz = 6; + else + out_sz = 5; /* * Get the config address and phb buid of the PE window. @@ -889,11 +960,28 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, buid = pdn->phb->buid; cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8)); - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, - cfg_addr, BUID_HI(buid), BUID_LO(buid)); - dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" - " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), - BUID_LO(buid), ret); + ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n", + ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid), + BUID_LO(buid), ret); + + switch (out_sz) { + case 5: + query->windows_available = query_out[0]; + query->largest_available_block = query_out[1]; + query->page_size = query_out[2]; + query->migration_capable = query_out[3]; + break; + case 6: + query->windows_available = query_out[0]; + query->largest_available_block = ((u64)query_out[1] << 32) | + query_out[2]; + query->page_size = query_out[3]; + query->migration_capable = query_out[4]; + break; + } + return ret; } @@ -920,15 +1008,16 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, - cfg_addr, BUID_HI(buid), BUID_LO(buid), - page_shift, window_shift); + ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4, + (u32 *)create, cfg_addr, BUID_HI(buid), + BUID_LO(buid), page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(&dev->dev, "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " - "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1], - cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift, - window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + "(liobn = 0x%x starting addr = %x %x)\n", + ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid), + BUID_LO(buid), page_shift, window_shift, ret, create->liobn, + create->addr_hi, create->addr_lo); return ret; } @@ -978,6 +1067,38 @@ static phys_addr_t ddw_memory_hotplug_max(void) } /* + * Platforms supporting the DDW option starting with LoPAR level 2.7 implement + * ibm,ddw-extensions, which carries the rtas token for + * ibm,reset-pe-dma-windows. + * That rtas-call can be used to restore the default DMA window for the device. + */ +static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn) +{ + int ret; + u32 cfg_addr, reset_dma_win; + u64 buid; + struct device_node *dn; + struct pci_dn *pdn; + + ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win); + if (ret) + return; + + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8); + + ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid), + BUID_LO(buid)); + if (ret) + dev_info(&dev->dev, + "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ", + reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid), + ret); +} + +/* * If the PE supports dynamic dma windows, and there is space for a table * that can map all pages in a linear offset, then setup such a table, * and record the dma-offset in the struct device. @@ -996,11 +1117,12 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) int page_shift; u64 dma_addr, max_addr; struct device_node *dn; - u32 ddw_avail[3]; + u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct direct_window *window; struct property *win64; struct dynamic_dma_window_prop *ddwprop; struct failed_ddw_pdn *fpdn; + bool default_win_removed = false; mutex_lock(&direct_window_init_mutex); @@ -1029,7 +1151,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) * the property is actually in the parent, not the PE */ ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", - &ddw_avail[0], 3); + &ddw_avail[0], DDW_APPLICABLE_SIZE); if (ret) goto out_failed; @@ -1040,18 +1162,42 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) * of page sizes: supported and supported for migrate-dma. */ dn = pci_device_to_OF_node(dev); - ret = query_ddw(dev, ddw_avail, &query); + ret = query_ddw(dev, ddw_avail, &query, pdn); if (ret != 0) goto out_failed; + /* + * If there is no window available, remove the default DMA window, + * if it's present. This will make all the resources available to the + * new DDW window. + * If anything fails after this, we need to restore it, so also check + * for extensions presence. + */ if (query.windows_available == 0) { - /* - * no additional windows are available for this device. - * We might be able to reallocate the existing window, - * trading in for a larger page size. - */ - dev_dbg(&dev->dev, "no free dynamic windows"); - goto out_failed; + struct property *default_win; + int reset_win_ext; + + default_win = of_find_property(pdn, "ibm,dma-window", NULL); + if (!default_win) + goto out_failed; + + reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL); + if (reset_win_ext) + goto out_failed; + + remove_dma_window(pdn, ddw_avail, default_win); + default_win_removed = true; + + /* Query again, to check if the window is available */ + ret = query_ddw(dev, ddw_avail, &query, pdn); + if (ret != 0) + goto out_failed; + + if (query.windows_available == 0) { + /* no windows are available for this device. */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_failed; + } } if (query.page_size & 4) { page_shift = 24; /* 16MB */ @@ -1068,7 +1214,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) /* check largest block * page size > max memory hotplug addr */ max_addr = ddw_memory_hotplug_max(); if (query.largest_available_block < (max_addr >> page_shift)) { - dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u " + dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu " "%llu-sized pages\n", max_addr, query.largest_available_block, 1ULL << page_shift); goto out_failed; @@ -1142,6 +1288,8 @@ out_free_prop: kfree(win64); out_failed: + if (default_win_removed) + reset_dma_window(dev, pdn); fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); if (!fpdn) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index baf24eacd268..764170fdb0f7 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1724,6 +1724,7 @@ void __init hpte_init_pseries(void) pseries_lpar_register_process_table(0, 0, 0); } +#ifdef CONFIG_PPC_RADIX_MMU void radix_init_pseries(void) { pr_info("Using radix MMU under hypervisor\n"); @@ -1731,6 +1732,7 @@ void radix_init_pseries(void) pseries_lpar_register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); } +#endif #ifdef CONFIG_PPC_SMLPAR #define CMO_FREE_HINT_DEFAULT 1 diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index b8d28ab88178..e278390ab28d 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -136,6 +136,39 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) return rc; } +static void show_gpci_data(struct seq_file *m) +{ + struct hv_gpci_request_buffer *buf; + unsigned int affinity_score; + long ret; + + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (buf == NULL) + return; + + /* + * Show the local LPAR's affinity score. + * + * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall. + * The score is at byte 0xB in the output buffer. + */ + memset(&buf->params, 0, sizeof(buf->params)); + buf->params.counter_request = cpu_to_be32(0xB1); + buf->params.starting_index = cpu_to_be32(-1); /* local LPAR */ + buf->params.counter_info_version_in = 0x5; /* v5+ for score */ + ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf), + sizeof(*buf)); + if (ret != H_SUCCESS) { + pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n", + ret, be32_to_cpu(buf->params.detail_rc)); + goto out; + } + affinity_score = buf->bytes[0xB]; + seq_printf(m, "partition_affinity_score=%u\n", affinity_score); +out: + kfree(buf); +} + static unsigned h_pic(unsigned long *pool_idle_time, unsigned long *num_procs) { @@ -487,6 +520,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) partition_active_processors * 100); } + show_gpci_data(m); + seq_printf(m, "partition_active_processors=%d\n", partition_active_processors); diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index a88a707a608a..835163f54244 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -785,7 +785,8 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, static ssize_t perf_stats_show(struct device *dev, struct device_attribute *attr, char *buf) { - int index, rc; + int index; + ssize_t rc; struct seq_buf s; struct papr_scm_perf_stat *stat; struct papr_scm_perf_stats *stats; @@ -820,9 +821,9 @@ static ssize_t perf_stats_show(struct device *dev, free_stats: kfree(stats); - return rc ? rc : seq_buf_used(&s); + return rc ? rc : (ssize_t)seq_buf_used(&s); } -DEVICE_ATTR_ADMIN_RO(perf_stats); +static DEVICE_ATTR_ADMIN_RO(perf_stats); static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -897,6 +898,9 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) p->bus_desc.of_node = p->pdev->dev.of_node; p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL); + /* Set the dimm command family mask to accept PDSMs */ + set_bit(NVDIMM_FAMILY_PAPR, &p->bus_desc.dimm_family_mask); + if (!p->bus_desc.provider_name) return -ENOMEM; diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c index bbb97169bf63..6268545947b8 100644 --- a/arch/powerpc/platforms/pseries/rng.c +++ b/arch/powerpc/platforms/pseries/rng.c @@ -36,6 +36,7 @@ static __init int rng_init(void) ppc_md.get_random_seed = pseries_get_random_long; + of_node_put(dn); return 0; } machine_subsys_initcall(pseries, rng_init); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 2f4ee0a90284..633c45ec406d 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -519,9 +519,15 @@ static void init_cpu_char_feature_flags(struct h_cpu_char_result *result) if (result->character & H_CPU_CHAR_BCCTR_FLUSH_ASSIST) security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST); + if (result->character & H_CPU_CHAR_BCCTR_LINK_FLUSH_ASSIST) + security_ftr_set(SEC_FTR_BCCTR_LINK_FLUSH_ASSIST); + if (result->behaviour & H_CPU_BEHAV_FLUSH_COUNT_CACHE) security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE); + if (result->behaviour & H_CPU_BEHAV_FLUSH_LINK_STACK) + security_ftr_set(SEC_FTR_FLUSH_LINK_STACK); + /* * The features below are enabled by default, so we instead look to see * if firmware has *disabled* them, and clear them if so. diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index e6d7a344d9f2..7b739cc7a8a9 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -7,6 +7,7 @@ */ #include <linux/mm.h> +#include <linux/memblock.h> #include <asm/machdep.h> #include <asm/svm.h> #include <asm/swiotlb.h> @@ -35,6 +36,31 @@ static int __init init_svm(void) } machine_early_initcall(pseries, init_svm); +/* + * Initialize SWIOTLB. Essentially the same as swiotlb_init(), except that it + * can allocate the buffer anywhere in memory. Since the hypervisor doesn't have + * any addressing limitation, we don't need to allocate it in low addresses. + */ +void __init svm_swiotlb_init(void) +{ + unsigned char *vstart; + unsigned long bytes, io_tlb_nslabs; + + io_tlb_nslabs = (swiotlb_size_or_default() >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE); + if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false)) + return; + + if (io_tlb_start) + memblock_free_early(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + panic("SVM: Cannot allocate SWIOTLB buffer"); +} + int set_memory_encrypted(unsigned long addr, int numpages) { if (!PAGE_ALIGNED(addr)) diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 0487b26f6f1a..b2797cfe4e2b 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -20,7 +20,7 @@ #include <linux/console.h> #include <linux/export.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/kobject.h> #include <asm/iommu.h> @@ -608,6 +608,8 @@ static const struct dma_map_ops vio_dma_mapping_ops = { .get_required_mask = dma_iommu_get_required_mask, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; /** diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c index ad8117148ea3..21b9d1bf39ff 100644 --- a/arch/powerpc/sysdev/xics/icp-hv.c +++ b/arch/powerpc/sysdev/xics/icp-hv.c @@ -174,6 +174,7 @@ int icp_hv_init(void) icp_ops = &icp_hv_ops; + of_node_put(np); return 0; } diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f591be9f01f4..a80440af491a 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1565,7 +1565,7 @@ static int __init xive_off(char *arg) } __setup("xive=off", xive_off); -void xive_debug_show_cpu(struct seq_file *m, int cpu) +static void xive_debug_show_cpu(struct seq_file *m, int cpu) { struct xive_cpu *xc = per_cpu(xive_cpu, cpu); @@ -1599,7 +1599,7 @@ void xive_debug_show_cpu(struct seq_file *m, int cpu) seq_puts(m, "\n"); } -void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d) +static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d) { struct irq_chip *chip = irq_data_get_irq_chip(d); int rc; diff --git a/arch/powerpc/tools/checkpatch.sh b/arch/powerpc/tools/checkpatch.sh index 3ce5c093b19d..91c04802ec31 100755 --- a/arch/powerpc/tools/checkpatch.sh +++ b/arch/powerpc/tools/checkpatch.sh @@ -9,7 +9,6 @@ script_base=$(realpath $(dirname $0)) exec $script_base/../../../scripts/checkpatch.pl \ --subjective \ --no-summary \ - --max-line-length=90 \ --show-types \ --ignore ARCH_INCLUDE_LINUX \ --ignore BIT_MACRO \ diff --git a/arch/powerpc/tools/unrel_branch_check.sh b/arch/powerpc/tools/unrel_branch_check.sh index 6e6a30aea3ed..8301efee1e6c 100755 --- a/arch/powerpc/tools/unrel_branch_check.sh +++ b/arch/powerpc/tools/unrel_branch_check.sh @@ -1,60 +1,79 @@ -# Copyright © 2016 IBM Corporation +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# Copyright © 2016,2020 IBM Corporation # -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version -# 2 of the License, or (at your option) any later version. -# -# This script checks the relocations of a vmlinux for "suspicious" -# branches from unrelocated code (head_64.S code). - -# Turn this on if you want more debug output: -# set -x +# This script checks the unrelocated code of a vmlinux for "suspicious" +# branches to relocated code (head_64.S code). -# Have Kbuild supply the path to objdump so we handle cross compilation. +# Have Kbuild supply the path to objdump and nm so we handle cross compilation. objdump="$1" -vmlinux="$2" - -#__end_interrupts should be located within the first 64K - -end_intr=0x$( -$objdump -R "$vmlinux" -d --start-address=0xc000000000000000 \ - --stop-address=0xc000000000010000 | -grep '\<__end_interrupts>:' | -awk '{print $1}' -) - -BRANCHES=$( -$objdump -R "$vmlinux" -D --start-address=0xc000000000000000 \ - --stop-address=${end_intr} | -grep -e "^c[0-9a-f]*:[[:space:]]*\([0-9a-f][0-9a-f][[:space:]]\)\{4\}[[:space:]]*b" | -grep -v '\<__start_initialization_multiplatform>' | -grep -v -e 'b.\?.\?ctr' | -grep -v -e 'b.\?.\?lr' | -sed -e 's/\bbt.\?[[:space:]]*[[:digit:]][[:digit:]]*,/beq/' \ - -e 's/\bbf.\?[[:space:]]*[[:digit:]][[:digit:]]*,/bne/' \ - -e 's/[[:space:]]0x/ /' \ - -e 's/://' | -awk '{ print $1 ":" $6 ":0x" $7 ":" $8 " "}' -) - -for tuple in $BRANCHES -do - from=`echo $tuple | cut -d':' -f1` - branch=`echo $tuple | cut -d':' -f2` - to=`echo $tuple | cut -d':' -f3 | sed 's/cr[0-7],//'` - sym=`echo $tuple | cut -d':' -f4` - - if (( $to > $end_intr )) - then - if [ -z "$bad_branches" ]; then - echo "WARNING: Unrelocated relative branches" - bad_branches="yes" +nm="$2" +vmlinux="$3" + +kstart=0xc000000000000000 + +end_intr=0x$($nm -p "$vmlinux" | + sed -E -n '/\s+[[:alpha:]]\s+__end_interrupts\s*$/{s///p;q}') +if [ "$end_intr" = "0x" ]; then + exit 0 +fi + +# we know that there is a correct branch to +# __start_initialization_multiplatform, so find its address +# so we can exclude it. +sim=0x$($nm -p "$vmlinux" | + sed -E -n '/\s+[[:alpha:]]\s+__start_initialization_multiplatform\s*$/{s///p;q}') + +$objdump -D --no-show-raw-insn --start-address="$kstart" --stop-address="$end_intr" "$vmlinux" | +sed -E -n ' +# match lines that start with a kernel address +/^c[0-9a-f]*:\s*b/ { + # drop branches via ctr or lr + /\<b.?.?(ct|l)r/d + # cope with some differences between Clang and GNU objdumps + s/\<bt.?\s*[[:digit:]]+,/beq/ + s/\<bf.?\s*[[:digit:]]+,/bne/ + # tidy up + s/\s0x/ / + s/:// + # format for the loop below + s/^(\S+)\s+(\S+)\s+(\S+)\s*(\S*).*$/\1:\2:\3:\4/ + # strip out condition registers + s/:cr[0-7],/:/ + p +}' | { + +all_good=true +while IFS=: read -r from branch to sym; do + case "$to" in + c*) to="0x$to" + ;; + .+*) + to=${to#.+} + if [ "$branch" = 'b' ]; then + if (( to >= 0x2000000 )); then + to=$(( to - 0x4000000 )) + fi + elif (( to >= 0x8000 )); then + to=$(( to - 0x10000 )) + fi + printf -v to '0x%x' $(( "0x$from" + to )) + ;; + *) printf 'Unkown branch format\n' + ;; + esac + if [ "$to" = "$sim" ]; then + continue + fi + if (( to > end_intr )); then + if $all_good; then + printf '%s\n' 'WARNING: Unrelocated relative branches' + all_good=false fi - echo "$from $branch-> $to $sym" + printf '%s %s-> %s %s\n' "$from" "$branch" "$to" "$sym" fi done -if [ -z "$bad_branches" ]; then - exit 0 -fi +$all_good + +} diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index df7bca00f5ec..55c43a6c9111 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -969,6 +969,7 @@ static void insert_cpu_bpts(void) brk.address = dabr[i].address; brk.type = (dabr[i].enabled & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; brk.len = 8; + brk.hw_len = 8; __set_breakpoint(i, &brk); } } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b7821ac36d28..d5e7ca08f22c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -38,6 +38,7 @@ config RISCV select GENERIC_ARCH_TOPOLOGY if SMP select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS + select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO select GENERIC_IOREMAP select GENERIC_IRQ_MULTI_HANDLER @@ -87,6 +88,7 @@ config RISCV select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select SET_FS config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT @@ -388,6 +390,28 @@ config CMDLINE_FORCE endchoice +config EFI_STUB + bool + +config EFI + bool "UEFI runtime support" + depends on OF + select LIBFDT + select UCS2_STRING + select EFI_PARAMS_FROM_FDT + select EFI_STUB + select EFI_GENERIC_STUB + select EFI_RUNTIME_WRAPPERS + select RISCV_ISA_C + depends on MMU + default y + help + This option provides support for runtime services provided + by UEFI firmware (such as non-volatile variables, realtime + clock, and platform reset). A UEFI stub is also provided to + allow the kernel to be booted as an EFI application. This + is only useful on systems that have UEFI firmware. + endmenu config BUILTIN_DTB @@ -400,3 +424,5 @@ menu "Power management options" source "kernel/power/Kconfig" endmenu + +source "drivers/firmware/Kconfig" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index fb6e37db836d..0289a97325d1 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -53,9 +53,6 @@ endif ifeq ($(CONFIG_CMODEL_MEDANY),y) KBUILD_CFLAGS += -mcmodel=medany endif -ifeq ($(CONFIG_MODULE_SECTIONS),y) - KBUILD_LDS_MODULE += $(srctree)/arch/riscv/kernel/module.lds -endif ifeq ($(CONFIG_PERF_EVENTS),y) KBUILD_CFLAGS += -fno-omit-frame-pointer endif @@ -80,6 +77,7 @@ head-y := arch/riscv/kernel/head.o core-y += arch/riscv/ libs-y += arch/riscv/lib/ +libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a PHONY += vdso_install vdso_install: diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index d58c93efb603..d222d353d86d 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -130,3 +130,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y # CONFIG_SYSFS_SYSCALL is not set +CONFIG_EFI=y diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 3d9410bb4de0..59dd7be55005 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h generic-y += kvm_para.h diff --git a/arch/riscv/include/asm/cacheinfo.h b/arch/riscv/include/asm/cacheinfo.h index 5d9662e9aba8..d1a365215ec0 100644 --- a/arch/riscv/include/asm/cacheinfo.h +++ b/arch/riscv/include/asm/cacheinfo.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 SiFive + */ #ifndef _ASM_RISCV_CACHEINFO_H #define _ASM_RISCV_CACHEINFO_H @@ -11,5 +14,7 @@ struct riscv_cacheinfo_ops { }; void riscv_set_cacheinfo_ops(struct riscv_cacheinfo_ops *ops); +uintptr_t get_cache_size(u32 level, enum cache_type type); +uintptr_t get_cache_geometry(u32 level, enum cache_type type); #endif /* _ASM_RISCV_CACHEINFO_H */ diff --git a/arch/riscv/include/asm/efi.h b/arch/riscv/include/asm/efi.h new file mode 100644 index 000000000000..7542282f1141 --- /dev/null +++ b/arch/riscv/include/asm/efi.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef _ASM_EFI_H +#define _ASM_EFI_H + +#include <asm/csr.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/ptrace.h> +#include <asm/tlbflush.h> + +#ifdef CONFIG_EFI +extern void efi_init(void); +#else +#define efi_init() +#endif + +int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); + +#define arch_efi_call_virt_setup() efi_virtmap_load() +#define arch_efi_call_virt_teardown() efi_virtmap_unload() + +#define arch_efi_call_virt(p, f, args...) p->f(args) + +#define ARCH_EFI_IRQ_FLAGS_MASK (SR_IE | SR_SPIE) + +/* on RISC-V, the FDT may be located anywhere in system RAM */ +static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) +{ + return ULONG_MAX; +} + +/* Load initrd at enough distance from DRAM start */ +static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr) +{ + return image_addr + SZ_256M; +} + +#define alloc_screen_info(x...) (&screen_info) + +static inline void free_screen_info(struct screen_info *si) +{ +} + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} + +void efi_virtmap_load(void); +void efi_virtmap_unload(void); + +#endif /* _ASM_EFI_H */ diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index d83a4efd052b..5c725e1df58b 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -11,6 +11,7 @@ #include <uapi/asm/elf.h> #include <asm/auxvec.h> #include <asm/byteorder.h> +#include <asm/cacheinfo.h> /* * These are used to set parameters in the core dumps. @@ -61,6 +62,18 @@ extern unsigned long elf_hwcap; do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_L1I_CACHESIZE, \ + get_cache_size(1, CACHE_TYPE_INST)); \ + NEW_AUX_ENT(AT_L1I_CACHEGEOMETRY, \ + get_cache_geometry(1, CACHE_TYPE_INST)); \ + NEW_AUX_ENT(AT_L1D_CACHESIZE, \ + get_cache_size(1, CACHE_TYPE_DATA)); \ + NEW_AUX_ENT(AT_L1D_CACHEGEOMETRY, \ + get_cache_geometry(1, CACHE_TYPE_DATA)); \ + NEW_AUX_ENT(AT_L2_CACHESIZE, \ + get_cache_size(2, CACHE_TYPE_UNIFIED)); \ + NEW_AUX_ENT(AT_L2_CACHEGEOMETRY, \ + get_cache_geometry(2, CACHE_TYPE_UNIFIED)); \ } while (0) #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 1ff075a8dfc7..54cbf07fb4e9 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -22,14 +22,24 @@ */ enum fixed_addresses { FIX_HOLE, -#define FIX_FDT_SIZE SZ_1M - FIX_FDT_END, - FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1, FIX_PTE, FIX_PMD, FIX_TEXT_POKE1, FIX_TEXT_POKE0, FIX_EARLYCON_MEM_BASE, + + __end_of_permanent_fixed_addresses, + /* + * Temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + */ +#define NR_FIX_BTMAPS (SZ_256K / PAGE_SIZE) +#define FIX_BTMAPS_SLOTS 7 +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + __end_of_fixed_addresses }; diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 3835c3295dc5..c025a746a148 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/pgtable.h> #include <asm/mmiowb.h> +#include <asm/early_ioremap.h> /* * MMIO access functions are separated out to break dependency cycles diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 967eacb01ab5..dabcf2cfb3dc 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -20,6 +20,8 @@ typedef struct { #endif } mm_context_t; +void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot); #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_MMU_H */ diff --git a/arch/riscv/kernel/module.lds b/arch/riscv/include/asm/module.lds.h index 295ecfb341a2..4254ff2ff049 100644 --- a/arch/riscv/kernel/module.lds +++ b/arch/riscv/include/asm/module.lds.h @@ -1,8 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2017 Andes Technology Corporation */ - +#ifdef CONFIG_MODULE_SECTIONS SECTIONS { .plt (NOLOAD) : { BYTE(0) } .got (NOLOAD) : { BYTE(0) } .got.plt (NOLOAD) : { BYTE(0) } } +#endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index eaea1f717010..183f1f4b2ae6 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -100,6 +100,10 @@ #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL | _PAGE_EXEC) +#define PAGE_KERNEL_READ __pgprot(_PAGE_KERNEL & ~_PAGE_WRITE) +#define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL | _PAGE_EXEC) +#define PAGE_KERNEL_READ_EXEC __pgprot((_PAGE_KERNEL & ~_PAGE_WRITE) \ + | _PAGE_EXEC) #define PAGE_TABLE __pgprot(_PAGE_TABLE) @@ -464,6 +468,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl #define kern_addr_valid(addr) (1) /* FIXME */ extern void *dtb_early_va; +extern uintptr_t dtb_early_pa; void setup_bootmem(void); void paging_init(void); diff --git a/arch/riscv/include/asm/sections.h b/arch/riscv/include/asm/sections.h new file mode 100644 index 000000000000..3a9971b1210f --- /dev/null +++ b/arch/riscv/include/asm/sections.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef __ASM_SECTIONS_H +#define __ASM_SECTIONS_H + +#include <asm-generic/sections.h> + +extern char _start[]; +extern char _start_kernel[]; + +#endif /* __ASM_SECTIONS_H */ diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index d86cb17bbabe..32c73ba1d531 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -10,4 +10,28 @@ /* vDSO location */ #define AT_SYSINFO_EHDR 33 +/* + * The set of entries below represent more extensive information + * about the caches, in the form of two entry per cache type, + * one entry containing the cache size in bytes, and the other + * containing the cache line size in bytes in the bottom 16 bits + * and the cache associativity in the next 16 bits. + * + * The associativity is such that if N is the 16-bit value, the + * cache is N way set associative. A value if 0xffff means fully + * associative, a value of 1 means directly mapped. + * + * For all these fields, a value of 0 means that the information + * is not known. + */ +#define AT_L1I_CACHESIZE 40 +#define AT_L1I_CACHEGEOMETRY 41 +#define AT_L1D_CACHESIZE 42 +#define AT_L1D_CACHEGEOMETRY 43 +#define AT_L2_CACHESIZE 44 +#define AT_L2_CACHEGEOMETRY 45 + +/* entries in ARCH_DLINFO */ +#define AT_VECTOR_SIZE_ARCH 7 + #endif /* _UAPI_ASM_RISCV_AUXVEC_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index dc93710f0b2f..fa896c5f7ccb 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -55,4 +55,6 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_EFI) += efi.o + clean: diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index bd0f122965c3..de59dd457b41 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -3,7 +3,6 @@ * Copyright (C) 2017 SiFive */ -#include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/of.h> #include <linux/of_device.h> @@ -25,12 +24,84 @@ cache_get_priv_group(struct cacheinfo *this_leaf) return NULL; } -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct device_node *node, - enum cache_type type, unsigned int level) +static struct cacheinfo *get_cacheinfo(u32 level, enum cache_type type) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(smp_processor_id()); + struct cacheinfo *this_leaf; + int index; + + for (index = 0; index < this_cpu_ci->num_leaves; index++) { + this_leaf = this_cpu_ci->info_list + index; + if (this_leaf->level == level && this_leaf->type == type) + return this_leaf; + } + + return NULL; +} + +uintptr_t get_cache_size(u32 level, enum cache_type type) +{ + struct cacheinfo *this_leaf = get_cacheinfo(level, type); + + return this_leaf ? this_leaf->size : 0; +} + +uintptr_t get_cache_geometry(u32 level, enum cache_type type) +{ + struct cacheinfo *this_leaf = get_cacheinfo(level, type); + + return this_leaf ? (this_leaf->ways_of_associativity << 16 | + this_leaf->coherency_line_size) : + 0; +} + +static void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type, + unsigned int level, unsigned int size, + unsigned int sets, unsigned int line_size) { this_leaf->level = level; this_leaf->type = type; + this_leaf->size = size; + this_leaf->number_of_sets = sets; + this_leaf->coherency_line_size = line_size; + + /* + * If the cache is fully associative, there is no need to + * check the other properties. + */ + if (sets == 1) + return; + + /* + * Set the ways number for n-ways associative, make sure + * all properties are big than zero. + */ + if (sets > 0 && size > 0 && line_size > 0) + this_leaf->ways_of_associativity = (size / sets) / line_size; +} + +static void fill_cacheinfo(struct cacheinfo **this_leaf, + struct device_node *node, unsigned int level) +{ + unsigned int size, sets, line_size; + + if (!of_property_read_u32(node, "cache-size", &size) && + !of_property_read_u32(node, "cache-block-size", &line_size) && + !of_property_read_u32(node, "cache-sets", &sets)) { + ci_leaf_init((*this_leaf)++, CACHE_TYPE_UNIFIED, level, size, sets, line_size); + } + + if (!of_property_read_u32(node, "i-cache-size", &size) && + !of_property_read_u32(node, "i-cache-sets", &sets) && + !of_property_read_u32(node, "i-cache-block-size", &line_size)) { + ci_leaf_init((*this_leaf)++, CACHE_TYPE_INST, level, size, sets, line_size); + } + + if (!of_property_read_u32(node, "d-cache-size", &size) && + !of_property_read_u32(node, "d-cache-sets", &sets) && + !of_property_read_u32(node, "d-cache-block-size", &line_size)) { + ci_leaf_init((*this_leaf)++, CACHE_TYPE_DATA, level, size, sets, line_size); + } } static int __init_cache_level(unsigned int cpu) @@ -83,29 +154,24 @@ static int __populate_cache_leaves(unsigned int cpu) struct device_node *prev = NULL; int levels = 1, level = 1; - if (of_property_read_bool(np, "cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); + /* Level 1 caches in cpu node */ + fill_cacheinfo(&this_leaf, np, level); + /* Next level caches in cache nodes */ prev = np; while ((np = of_find_next_cache_node(np))) { of_node_put(prev); prev = np; + if (!of_device_is_compatible(np, "cache")) break; if (of_property_read_u32(np, "cache-level", &level)) break; if (level <= levels) break; - if (of_property_read_bool(np, "cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); + + fill_cacheinfo(&this_leaf, np, level); + levels = level; } of_node_put(np); diff --git a/arch/riscv/kernel/efi-header.S b/arch/riscv/kernel/efi-header.S new file mode 100644 index 000000000000..8e733aa48ba6 --- /dev/null +++ b/arch/riscv/kernel/efi-header.S @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + * Adapted from arch/arm64/kernel/efi-header.S + */ + +#include <linux/pe.h> +#include <linux/sizes.h> + + .macro __EFI_PE_HEADER + .long PE_MAGIC +coff_header: +#ifdef CONFIG_64BIT + .short IMAGE_FILE_MACHINE_RISCV64 // Machine +#else + .short IMAGE_FILE_MACHINE_RISCV32 // Machine +#endif + .short section_count // NumberOfSections + .long 0 // TimeDateStamp + .long 0 // PointerToSymbolTable + .long 0 // NumberOfSymbols + .short section_table - optional_header // SizeOfOptionalHeader + .short IMAGE_FILE_DEBUG_STRIPPED | \ + IMAGE_FILE_EXECUTABLE_IMAGE | \ + IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics + +optional_header: +#ifdef CONFIG_64BIT + .short PE_OPT_MAGIC_PE32PLUS // PE32+ format +#else + .short PE_OPT_MAGIC_PE32 // PE32 format +#endif + .byte 0x02 // MajorLinkerVersion + .byte 0x14 // MinorLinkerVersion + .long __pecoff_text_end - efi_header_end // SizeOfCode + .long __pecoff_data_virt_size // SizeOfInitializedData + .long 0 // SizeOfUninitializedData + .long __efistub_efi_pe_entry - _start // AddressOfEntryPoint + .long efi_header_end - _start // BaseOfCode +#ifdef CONFIG_32BIT + .long __pecoff_text_end - _start // BaseOfData +#endif + +extra_header_fields: + .quad 0 // ImageBase + .long PECOFF_SECTION_ALIGNMENT // SectionAlignment + .long PECOFF_FILE_ALIGNMENT // FileAlignment + .short 0 // MajorOperatingSystemVersion + .short 0 // MinorOperatingSystemVersion + .short LINUX_EFISTUB_MAJOR_VERSION // MajorImageVersion + .short LINUX_EFISTUB_MINOR_VERSION // MinorImageVersion + .short 0 // MajorSubsystemVersion + .short 0 // MinorSubsystemVersion + .long 0 // Win32VersionValue + + .long _end - _start // SizeOfImage + + // Everything before the kernel image is considered part of the header + .long efi_header_end - _start // SizeOfHeaders + .long 0 // CheckSum + .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem + .short 0 // DllCharacteristics + .quad 0 // SizeOfStackReserve + .quad 0 // SizeOfStackCommit + .quad 0 // SizeOfHeapReserve + .quad 0 // SizeOfHeapCommit + .long 0 // LoaderFlags + .long (section_table - .) / 8 // NumberOfRvaAndSizes + + .quad 0 // ExportTable + .quad 0 // ImportTable + .quad 0 // ResourceTable + .quad 0 // ExceptionTable + .quad 0 // CertificationTable + .quad 0 // BaseRelocationTable + + // Section table +section_table: + .ascii ".text\0\0\0" + .long __pecoff_text_end - efi_header_end // VirtualSize + .long efi_header_end - _start // VirtualAddress + .long __pecoff_text_end - efi_header_end // SizeOfRawData + .long efi_header_end - _start // PointerToRawData + + .long 0 // PointerToRelocations + .long 0 // PointerToLineNumbers + .short 0 // NumberOfRelocations + .short 0 // NumberOfLineNumbers + .long IMAGE_SCN_CNT_CODE | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_EXECUTE // Characteristics + + .ascii ".data\0\0\0" + .long __pecoff_data_virt_size // VirtualSize + .long __pecoff_text_end - _start // VirtualAddress + .long __pecoff_data_raw_size // SizeOfRawData + .long __pecoff_text_end - _start // PointerToRawData + + .long 0 // PointerToRelocations + .long 0 // PointerToLineNumbers + .short 0 // NumberOfRelocations + .short 0 // NumberOfLineNumbers + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_WRITE // Characteristics + + .set section_count, (. - section_table) / 40 + + .balign 0x1000 +efi_header_end: + .endm diff --git a/arch/riscv/kernel/efi.c b/arch/riscv/kernel/efi.c new file mode 100644 index 000000000000..024159298231 --- /dev/null +++ b/arch/riscv/kernel/efi.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + * Adapted from arch/arm64/kernel/efi.c + */ + +#include <linux/efi.h> +#include <linux/init.h> + +#include <asm/efi.h> +#include <asm/pgtable.h> +#include <asm/pgtable-bits.h> + +/* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. Also take the new (optional) RO/XP bits into account. + */ +static __init pgprot_t efimem_to_pgprot_map(efi_memory_desc_t *md) +{ + u64 attr = md->attribute; + u32 type = md->type; + + if (type == EFI_MEMORY_MAPPED_IO) + return PAGE_KERNEL; + + /* R-- */ + if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == + (EFI_MEMORY_XP | EFI_MEMORY_RO)) + return PAGE_KERNEL_READ; + + /* R-X */ + if (attr & EFI_MEMORY_RO) + return PAGE_KERNEL_READ_EXEC; + + /* RW- */ + if (((attr & (EFI_MEMORY_RP | EFI_MEMORY_WP | EFI_MEMORY_XP)) == + EFI_MEMORY_XP) || + type != EFI_RUNTIME_SERVICES_CODE) + return PAGE_KERNEL; + + /* RWX */ + return PAGE_KERNEL_EXEC; +} + +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pgprot_t prot = __pgprot(pgprot_val(efimem_to_pgprot_map(md)) & + ~(_PAGE_GLOBAL)); + int i; + + /* RISC-V maps one page at a time */ + for (i = 0; i < md->num_pages; i++) + create_pgd_mapping(mm->pgd, md->virt_addr + i * PAGE_SIZE, + md->phys_addr + i * PAGE_SIZE, + PAGE_SIZE, prot); + return 0; +} + +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) +{ + efi_memory_desc_t *md = data; + pte_t pte = READ_ONCE(*ptep); + unsigned long val; + + if (md->attribute & EFI_MEMORY_RO) { + val = pte_val(pte) & ~_PAGE_WRITE; + val = pte_val(pte) | _PAGE_READ; + pte = __pte(val); + } + if (md->attribute & EFI_MEMORY_XP) { + val = pte_val(pte) & ~_PAGE_EXEC; + pte = __pte(val); + } + set_pte(ptep, pte); + + return 0; +} + +int __init efi_set_mapping_permissions(struct mm_struct *mm, + efi_memory_desc_t *md) +{ + BUG_ON(md->type != EFI_RUNTIME_SERVICES_CODE && + md->type != EFI_RUNTIME_SERVICES_DATA); + + /* + * Calling apply_to_page_range() is only safe on regions that are + * guaranteed to be mapped down to pages. Since we are only called + * for regions that have been mapped using efi_create_mapping() above + * (and this is checked by the generic Memory Attributes table parsing + * routines), there is no need to check that again here. + */ + return apply_to_page_range(mm, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + set_permissions, md); +} diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 0a4e81b8dc79..11e2a4fe66e0 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -3,7 +3,6 @@ * Copyright (C) 2012 Regents of the University of California */ -#include <asm/thread_info.h> #include <asm/asm-offsets.h> #include <asm/asm.h> #include <linux/init.h> @@ -13,6 +12,7 @@ #include <asm/csr.h> #include <asm/hwcap.h> #include <asm/image.h> +#include "efi-header.S" __HEAD ENTRY(_start) @@ -22,10 +22,18 @@ ENTRY(_start) * Do not modify it without modifying the structure and all bootloaders * that expects this header format!! */ +#ifdef CONFIG_EFI + /* + * This instruction decodes to "MZ" ASCII required by UEFI. + */ + c.li s4,-13 + j _start_kernel +#else /* jump to start kernel */ j _start_kernel /* reserved */ .word 0 +#endif .balign 8 #if __riscv_xlen == 64 /* Image load offset(2MB) from start of RAM */ @@ -43,7 +51,14 @@ ENTRY(_start) .ascii RISCV_IMAGE_MAGIC .balign 4 .ascii RISCV_IMAGE_MAGIC2 +#ifdef CONFIG_EFI + .word pe_head_start - _start +pe_head_start: + + __EFI_PE_HEADER +#else .word 0 +#endif .align 2 #ifdef CONFIG_MMU @@ -259,7 +274,6 @@ clear_bss_done: #endif /* Start the kernel */ call soc_early_init - call parse_dtb tail start_kernel .Lsecondary_start: diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index 105fb0496b24..b48dda3d04f6 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -16,6 +16,4 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa); extern void *__cpu_up_stack_pointer[]; extern void *__cpu_up_task_pointer[]; -void __init parse_dtb(void); - #endif /* __ASM_HEAD_H */ diff --git a/arch/riscv/kernel/image-vars.h b/arch/riscv/kernel/image-vars.h new file mode 100644 index 000000000000..8c212efb37a6 --- /dev/null +++ b/arch/riscv/kernel/image-vars.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + * Linker script variables to be set after section resolution, as + * ld.lld does not like variables assigned before SECTIONS is processed. + * Based on arch/arm64/kerne/image-vars.h + */ +#ifndef __RISCV_KERNEL_IMAGE_VARS_H +#define __RISCV_KERNEL_IMAGE_VARS_H + +#ifndef LINKER_SCRIPT +#error This file should only be included in vmlinux.lds.S +#endif + +#ifdef CONFIG_EFI + +/* + * The EFI stub has its own symbol namespace prefixed by __efistub_, to + * isolate it from the kernel proper. The following symbols are legally + * accessed by the stub, so provide some aliases to make them accessible. + * Only include data symbols here, or text symbols of functions that are + * guaranteed to be safe when executed at another offset than they were + * linked at. The routines below are all implemented in assembler in a + * position independent manner + */ +__efistub_memcmp = memcmp; +__efistub_memchr = memchr; +__efistub_memcpy = memcpy; +__efistub_memmove = memmove; +__efistub_memset = memset; +__efistub_strlen = strlen; +__efistub_strnlen = strnlen; +__efistub_strcmp = strcmp; +__efistub_strncmp = strncmp; +__efistub_strrchr = strrchr; + +#ifdef CONFIG_KASAN +__efistub___memcpy = memcpy; +__efistub___memmove = memmove; +__efistub___memset = memset; +#endif + +__efistub__start = _start; +__efistub__start_kernel = _start_kernel; +__efistub__end = _end; +__efistub__edata = _edata; +__efistub_screen_info = screen_info; + +#endif + +#endif /* __RISCV_KERNEL_IMAGE_VARS_H */ diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 2c6dd329312b..4c96ac198e14 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -17,19 +17,22 @@ #include <linux/sched/task.h> #include <linux/swiotlb.h> #include <linux/smp.h> +#include <linux/efi.h> #include <asm/cpu_ops.h> +#include <asm/early_ioremap.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/sbi.h> #include <asm/tlbflush.h> #include <asm/thread_info.h> #include <asm/kasan.h> +#include <asm/efi.h> #include "head.h" -#ifdef CONFIG_DUMMY_CONSOLE -struct screen_info screen_info = { +#if defined(CONFIG_DUMMY_CONSOLE) || defined(CONFIG_EFI) +struct screen_info screen_info __section(.data) = { .orig_video_lines = 30, .orig_video_cols = 80, .orig_video_mode = 0, @@ -48,8 +51,9 @@ atomic_t hart_lottery __section(.sdata); unsigned long boot_cpu_hartid; static DEFINE_PER_CPU(struct cpu, cpu_devices); -void __init parse_dtb(void) +static void __init parse_dtb(void) { + /* Early scan of device tree from init memory */ if (early_init_dt_scan(dtb_early_va)) return; @@ -62,6 +66,7 @@ void __init parse_dtb(void) void __init setup_arch(char **cmdline_p) { + parse_dtb(); init_mm.start_code = (unsigned long) _stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; @@ -69,14 +74,19 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; + early_ioremap_setup(); parse_early_param(); + efi_init(); setup_bootmem(); paging_init(); #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - unflatten_device_tree(); + if (early_init_dt_verify(__va(dtb_early_pa))) + unflatten_device_tree(); + else + pr_err("No DTB found in kernel mappings\n"); #endif #ifdef CONFIG_SWIOTLB diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index e996e08f1061..bc6841867b51 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -313,8 +313,6 @@ asmlinkage __visible void do_notify_resume(struct pt_regs *regs, if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 478e7338ddc1..7d6a94d45ec9 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -49,7 +49,7 @@ $(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE # refer to these symbols in the kernel code rather than hand-coded addresses. SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ - -Wl,--build-id -Wl,--hash-style=both + -Wl,--build-id=sha1 -Wl,--hash-style=both $(obj)/vdso-dummy.o: $(src)/vdso.lds $(obj)/rt_sigreturn.o FORCE $(call if_changed,vdsold) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 67db80e12d1f..3ffbd6cbdb86 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -10,6 +10,7 @@ #include <asm/cache.h> #include <asm/thread_info.h> #include <asm/set_memory.h> +#include "image-vars.h" #include <linux/sizes.h> OUTPUT_ARCH(riscv) @@ -17,6 +18,9 @@ ENTRY(_start) jiffies = jiffies_64; +PECOFF_SECTION_ALIGNMENT = 0x1000; +PECOFF_FILE_ALIGNMENT = 0x200; + SECTIONS { /* Beginning of code and text segment */ @@ -66,6 +70,11 @@ SECTIONS _etext = .; } +#ifdef CONFIG_EFI + . = ALIGN(PECOFF_SECTION_ALIGNMENT); + __pecoff_text_end = .; +#endif + INIT_DATA_SECTION(16) /* Start of data section */ @@ -84,16 +93,26 @@ SECTIONS .sdata : { __global_pointer$ = . + 0x800; *(.sdata*) - /* End of data section */ - _edata = .; } +#ifdef CONFIG_EFI + .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); } + __pecoff_data_raw_size = ABSOLUTE(. - __pecoff_text_end); +#endif + + /* End of data section */ + _edata = .; + BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) .rel.dyn : { *(.rel.dyn*) } +#ifdef CONFIG_EFI + . = ALIGN(PECOFF_SECTION_ALIGNMENT); + __pecoff_data_virt_size = ABSOLUTE(. - __pecoff_text_end); +#endif _end = .; STABS_DEBUG diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 716d64e36f83..1359e21c0c62 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -19,6 +19,167 @@ #include "../kernel/head.h" +static inline void no_context(struct pt_regs *regs, unsigned long addr) +{ + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + bust_spinlocks(1); + pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", + (addr < PAGE_SIZE) ? "NULL pointer dereference" : + "paging request", addr); + die(regs, "Oops"); + do_exit(SIGKILL); +} + +static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) +{ + if (fault & VM_FAULT_OOM) { + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + pagefault_out_of_memory(); + return; + } else if (fault & VM_FAULT_SIGBUS) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + do_trap(regs, SIGBUS, BUS_ADRERR, addr); + return; + } + BUG(); +} + +static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) +{ + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ + mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); + return; + } + + no_context(regs, addr); +} + +static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) +{ + pgd_t *pgd, *pgd_k; + pud_t *pud, *pud_k; + p4d_t *p4d, *p4d_k; + pmd_t *pmd, *pmd_k; + pte_t *pte_k; + int index; + + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) + return do_trap(regs, SIGSEGV, code, addr); + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "tsk->active_mm->pgd" here. + * We might be inside an interrupt in the middle + * of a task switch. + */ + index = pgd_index(addr); + pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) { + no_context(regs, addr); + return; + } + set_pgd(pgd, *pgd_k); + + p4d = p4d_offset(pgd, addr); + p4d_k = p4d_offset(pgd_k, addr); + if (!p4d_present(*p4d_k)) { + no_context(regs, addr); + return; + } + + pud = pud_offset(p4d, addr); + pud_k = pud_offset(p4d_k, addr); + if (!pud_present(*pud_k)) { + no_context(regs, addr); + return; + } + + /* + * Since the vmalloc area is global, it is unnecessary + * to copy individual PTEs + */ + pmd = pmd_offset(pud, addr); + pmd_k = pmd_offset(pud_k, addr); + if (!pmd_present(*pmd_k)) { + no_context(regs, addr); + return; + } + set_pmd(pmd, *pmd_k); + + /* + * Make sure the actual PTE exists as well to + * catch kernel vmalloc-area accesses to non-mapped + * addresses. If we don't do this, this will just + * silently loop forever. + */ + pte_k = pte_offset_kernel(pmd_k, addr); + if (!pte_present(*pte_k)) { + no_context(regs, addr); + return; + } + + /* + * The kernel assumes that TLBs don't cache invalid + * entries, but in RISC-V, SFENCE.VMA specifies an + * ordering constraint, not a cache flush; it is + * necessary even after writing invalid entries. + */ + local_flush_tlb_page(addr); +} + +static inline bool access_error(unsigned long cause, struct vm_area_struct *vma) +{ + switch (cause) { + case EXC_INST_PAGE_FAULT: + if (!(vma->vm_flags & VM_EXEC)) { + return true; + } + break; + case EXC_LOAD_PAGE_FAULT: + if (!(vma->vm_flags & VM_READ)) { + return true; + } + break; + case EXC_STORE_PAGE_FAULT: + if (!(vma->vm_flags & VM_WRITE)) { + return true; + } + break; + default: + panic("%s: unhandled cause %lu", __func__, cause); + } + return false; +} + /* * This routine handles page faults. It determines the address and the * problem, and then passes it off to one of the appropriate routines. @@ -48,8 +209,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * only copy the information from the master page table, * nothing more. */ - if (unlikely((addr >= VMALLOC_START) && (addr <= VMALLOC_END))) - goto vmalloc_fault; + if (unlikely((addr >= VMALLOC_START) && (addr <= VMALLOC_END))) { + vmalloc_fault(regs, code, addr); + return; + } /* Enable interrupts if they were enabled in the parent context. */ if (likely(regs->status & SR_PIE)) @@ -59,25 +222,37 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * If we're in an interrupt, have no user context, or are running * in an atomic region, then we must not take the fault. */ - if (unlikely(faulthandler_disabled() || !mm)) - goto no_context; + if (unlikely(faulthandler_disabled() || !mm)) { + no_context(regs, addr); + return; + } if (user_mode(regs)) flags |= FAULT_FLAG_USER; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + if (cause == EXC_STORE_PAGE_FAULT) + flags |= FAULT_FLAG_WRITE; + else if (cause == EXC_INST_PAGE_FAULT) + flags |= FAULT_FLAG_INSTRUCTION; retry: mmap_read_lock(mm); vma = find_vma(mm, addr); - if (unlikely(!vma)) - goto bad_area; + if (unlikely(!vma)) { + bad_area(regs, mm, code, addr); + return; + } if (likely(vma->vm_start <= addr)) goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) - goto bad_area; - if (unlikely(expand_stack(vma, addr))) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, mm, code, addr); + return; + } + if (unlikely(expand_stack(vma, addr))) { + bad_area(regs, mm, code, addr); + return; + } /* * Ok, we have a good vm_area for this memory access, so @@ -86,22 +261,9 @@ retry: good_area: code = SEGV_ACCERR; - switch (cause) { - case EXC_INST_PAGE_FAULT: - if (!(vma->vm_flags & VM_EXEC)) - goto bad_area; - break; - case EXC_LOAD_PAGE_FAULT: - if (!(vma->vm_flags & VM_READ)) - goto bad_area; - break; - case EXC_STORE_PAGE_FAULT: - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - flags |= FAULT_FLAG_WRITE; - break; - default: - panic("%s: unhandled cause %lu", __func__, cause); + if (unlikely(access_error(cause, vma))) { + bad_area(regs, mm, code, addr); + return; } /* @@ -119,144 +281,22 @@ good_area: if (fault_signal_pending(fault, regs)) return; - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { + flags |= FAULT_FLAG_TRIED; - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ - goto retry; - } + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + goto retry; } mmap_read_unlock(mm); - return; - /* - * Something tried to access memory that isn't in our memory map. - * Fix it, but check if it's kernel or user first. - */ -bad_area: - mmap_read_unlock(mm); - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs)) { - do_trap(regs, SIGSEGV, code, addr); + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, addr, fault); return; } - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - bust_spinlocks(1); - pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); - die(regs, "Oops"); - do_exit(SIGKILL); - - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ -out_of_memory: - mmap_read_unlock(mm); - if (!user_mode(regs)) - goto no_context; - pagefault_out_of_memory(); - return; - -do_sigbus: - mmap_read_unlock(mm); - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - goto no_context; - do_trap(regs, SIGBUS, BUS_ADRERR, addr); return; - -vmalloc_fault: - { - pgd_t *pgd, *pgd_k; - pud_t *pud, *pud_k; - p4d_t *p4d, *p4d_k; - pmd_t *pmd, *pmd_k; - pte_t *pte_k; - int index; - - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs)) - return do_trap(regs, SIGSEGV, code, addr); - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "tsk->active_mm->pgd" here. - * We might be inside an interrupt in the middle - * of a task switch. - */ - index = pgd_index(addr); - pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - goto no_context; - set_pgd(pgd, *pgd_k); - - p4d = p4d_offset(pgd, addr); - p4d_k = p4d_offset(pgd_k, addr); - if (!p4d_present(*p4d_k)) - goto no_context; - - pud = pud_offset(p4d, addr); - pud_k = pud_offset(p4d_k, addr); - if (!pud_present(*pud_k)) - goto no_context; - - /* - * Since the vmalloc area is global, it is unnecessary - * to copy individual PTEs - */ - pmd = pmd_offset(pud, addr); - pmd_k = pmd_offset(pud_k, addr); - if (!pmd_present(*pmd_k)) - goto no_context; - set_pmd(pmd, *pmd_k); - - /* - * Make sure the actual PTE exists as well to - * catch kernel vmalloc-area accesses to non-mapped - * addresses. If we don't do this, this will just - * silently loop forever. - */ - pte_k = pte_offset_kernel(pmd_k, addr); - if (!pte_present(*pte_k)) - goto no_context; - - /* - * The kernel assumes that TLBs don't cache invalid - * entries, but in RISC-V, SFENCE.VMA specifies an - * ordering constraint, not a cache flush; it is - * necessary even after writing invalid entries. - */ - local_flush_tlb_page(addr); - - return; - } } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 1dc89303b679..ea933b789a88 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -28,7 +28,18 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] EXPORT_SYMBOL(empty_zero_page); extern char _start[]; -void *dtb_early_va; +#define DTB_EARLY_BASE_VA PGDIR_SIZE +void *dtb_early_va __initdata; +uintptr_t dtb_early_pa __initdata; + +struct pt_alloc_ops { + pte_t *(*get_pte_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pte)(uintptr_t va); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t *(*get_pmd_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pmd)(uintptr_t va); +#endif +}; static void __init zone_sizes_init(void) { @@ -141,8 +152,6 @@ disable: } #endif /* CONFIG_BLK_DEV_INITRD */ -static phys_addr_t dtb_early_pa __initdata; - void __init setup_bootmem(void) { phys_addr_t mem_size = 0; @@ -194,6 +203,8 @@ void __init setup_bootmem(void) } #ifdef CONFIG_MMU +static struct pt_alloc_ops pt_ops; + unsigned long va_pa_offset; EXPORT_SYMBOL(va_pa_offset); unsigned long pfn_base; @@ -202,7 +213,6 @@ EXPORT_SYMBOL(pfn_base); pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; -static bool mmu_enabled; #define MAX_EARLY_MAPPING_SIZE SZ_128M @@ -224,27 +234,46 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) local_flush_tlb_page(addr); } -static pte_t *__init get_pte_virt(phys_addr_t pa) +static inline pte_t *__init get_pte_virt_early(phys_addr_t pa) { - if (mmu_enabled) { - clear_fixmap(FIX_PTE); - return (pte_t *)set_fixmap_offset(FIX_PTE, pa); - } else { - return (pte_t *)((uintptr_t)pa); - } + return (pte_t *)((uintptr_t)pa); } -static phys_addr_t __init alloc_pte(uintptr_t va) +static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PTE); + return (pte_t *)set_fixmap_offset(FIX_PTE, pa); +} + +static inline pte_t *get_pte_virt_late(phys_addr_t pa) +{ + return (pte_t *) __va(pa); +} + +static inline phys_addr_t __init alloc_pte_early(uintptr_t va) { /* * We only create PMD or PGD early mappings so we * should never reach here with MMU disabled. */ - BUG_ON(!mmu_enabled); + BUG(); +} +static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) +{ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } +static phys_addr_t alloc_pte_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + if (!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))) + BUG(); + return __pa(vaddr); +} + static void __init create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot) @@ -269,28 +298,46 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; #endif pmd_t early_pmd[PTRS_PER_PMD * NUM_EARLY_PMDS] __initdata __aligned(PAGE_SIZE); -static pmd_t *__init get_pmd_virt(phys_addr_t pa) +static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { - if (mmu_enabled) { - clear_fixmap(FIX_PMD); - return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); - } else { - return (pmd_t *)((uintptr_t)pa); - } + /* Before MMU is enabled */ + return (pmd_t *)((uintptr_t)pa); } -static phys_addr_t __init alloc_pmd(uintptr_t va) +static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) { - uintptr_t pmd_num; + clear_fixmap(FIX_PMD); + return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); +} - if (mmu_enabled) - return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +static pmd_t *get_pmd_virt_late(phys_addr_t pa) +{ + return (pmd_t *) __va(pa); +} + +static phys_addr_t __init alloc_pmd_early(uintptr_t va) +{ + uintptr_t pmd_num; pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; BUG_ON(pmd_num >= NUM_EARLY_PMDS); return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; } +static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_pmd_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + static void __init create_pmd_mapping(pmd_t *pmdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot) @@ -306,34 +353,34 @@ static void __init create_pmd_mapping(pmd_t *pmdp, } if (pmd_none(pmdp[pmd_idx])) { - pte_phys = alloc_pte(va); + pte_phys = pt_ops.alloc_pte(va); pmdp[pmd_idx] = pfn_pmd(PFN_DOWN(pte_phys), PAGE_TABLE); - ptep = get_pte_virt(pte_phys); + ptep = pt_ops.get_pte_virt(pte_phys); memset(ptep, 0, PAGE_SIZE); } else { pte_phys = PFN_PHYS(_pmd_pfn(pmdp[pmd_idx])); - ptep = get_pte_virt(pte_phys); + ptep = pt_ops.get_pte_virt(pte_phys); } create_pte_mapping(ptep, va, pa, sz, prot); } #define pgd_next_t pmd_t -#define alloc_pgd_next(__va) alloc_pmd(__va) -#define get_pgd_next_virt(__pa) get_pmd_virt(__pa) +#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) +#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next fixmap_pmd #else #define pgd_next_t pte_t -#define alloc_pgd_next(__va) alloc_pte(__va) -#define get_pgd_next_virt(__pa) get_pte_virt(__pa) +#define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) +#define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next fixmap_pte #endif -static void __init create_pgd_mapping(pgd_t *pgdp, +void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot) { @@ -389,10 +436,13 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) asmlinkage void __init setup_vm(uintptr_t dtb_pa) { - uintptr_t va, end_va; + uintptr_t va, pa, end_va; uintptr_t load_pa = (uintptr_t)(&_start); uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t fix_bmap_spmd, fix_bmap_epmd; +#endif va_pa_offset = PAGE_OFFSET - load_pa; pfn_base = PFN_DOWN(load_pa); @@ -408,6 +458,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((load_pa % map_size) != 0); BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE); + pt_ops.alloc_pte = alloc_pte_early; + pt_ops.get_pte_virt = get_pte_virt_early; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_early; + pt_ops.get_pmd_virt = get_pmd_virt_early; +#endif /* Setup early PGD for fixmap */ create_pgd_mapping(early_pg_dir, FIXADDR_START, (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); @@ -438,17 +494,44 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) load_pa + (va - PAGE_OFFSET), map_size, PAGE_KERNEL_EXEC); - /* Create fixed mapping for early FDT parsing */ - end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; - for (va = __fix_to_virt(FIX_FDT); va < end_va; va += PAGE_SIZE) - create_pte_mapping(fixmap_pte, va, - dtb_pa + (va - __fix_to_virt(FIX_FDT)), - PAGE_SIZE, PAGE_KERNEL); - - /* Save pointer to DTB for early FDT parsing */ - dtb_early_va = (void *)fix_to_virt(FIX_FDT) + (dtb_pa & ~PAGE_MASK); - /* Save physical address for memblock reservation */ + /* Create two consecutive PGD mappings for FDT early scan */ + pa = dtb_pa & ~(PGDIR_SIZE - 1); + create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, + pa, PGDIR_SIZE, PAGE_KERNEL); + create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA + PGDIR_SIZE, + pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); + dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); dtb_early_pa = dtb_pa; + + /* + * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap + * range can not span multiple pmds. + */ + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); + +#ifndef __PAGETABLE_PMD_FOLDED + /* + * Early ioremap fixmap is already created as it lies within first 2MB + * of fixmap region. We always map PMD_SIZE. Thus, both FIX_BTMAP_END + * FIX_BTMAP_BEGIN should lie in the same pmd. Verify that and warn + * the user if not. + */ + fix_bmap_spmd = fixmap_pmd[pmd_index(__fix_to_virt(FIX_BTMAP_BEGIN))]; + fix_bmap_epmd = fixmap_pmd[pmd_index(__fix_to_virt(FIX_BTMAP_END))]; + if (pmd_val(fix_bmap_spmd) != pmd_val(fix_bmap_epmd)) { + WARN_ON(1); + pr_warn("fixmap btmap start [%08lx] != end [%08lx]\n", + pmd_val(fix_bmap_spmd), pmd_val(fix_bmap_epmd)); + pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN); + } +#endif } static void __init setup_vm_final(void) @@ -457,9 +540,16 @@ static void __init setup_vm_final(void) phys_addr_t pa, start, end; u64 i; - /* Set mmu_enabled flag */ - mmu_enabled = true; - + /** + * MMU is enabled at this point. But page table setup is not complete yet. + * fixmap page table alloc functions should be used at this point + */ + pt_ops.alloc_pte = alloc_pte_fixmap; + pt_ops.get_pte_virt = get_pte_virt_fixmap; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_fixmap; + pt_ops.get_pmd_virt = get_pmd_virt_fixmap; +#endif /* Setup swapper PGD for fixmap */ create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), @@ -488,6 +578,14 @@ static void __init setup_vm_final(void) /* Move to swapper page table */ csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); + + /* generic page allocation functions must be used to setup page table */ + pt_ops.alloc_pte = alloc_pte_late; + pt_ops.get_pte_virt = get_pte_virt_late; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_late; + pt_ops.get_pmd_virt = get_pmd_virt_late; +#endif } #else asmlinkage void __init setup_vm(uintptr_t dtb_pa) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 0831c2e61a8f..ace74dec7492 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -3,6 +3,7 @@ * Copyright (C) 2019 SiFive */ +#include <linux/efi.h> #include <linux/init.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -49,6 +50,14 @@ struct addr_marker { const char *name; }; +/* Private information for debugfs */ +struct ptd_mm_info { + struct mm_struct *mm; + const struct addr_marker *markers; + unsigned long base_addr; + unsigned long end; +}; + static struct addr_marker address_markers[] = { #ifdef CONFIG_KASAN {KASAN_SHADOW_START, "Kasan shadow start"}, @@ -68,6 +77,28 @@ static struct addr_marker address_markers[] = { {-1, NULL}, }; +static struct ptd_mm_info kernel_ptd_info = { + .mm = &init_mm, + .markers = address_markers, + .base_addr = KERN_VIRT_START, + .end = ULONG_MAX, +}; + +#ifdef CONFIG_EFI +static struct addr_marker efi_addr_markers[] = { + { 0, "UEFI runtime start" }, + { SZ_1G, "UEFI runtime end" }, + { -1, NULL } +}; + +static struct ptd_mm_info efi_ptd_info = { + .mm = &efi_mm, + .markers = efi_addr_markers, + .base_addr = 0, + .end = SZ_2G, +}; +#endif + /* Page Table Entry */ struct prot_bits { u64 mask; @@ -245,22 +276,22 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, } } -static void ptdump_walk(struct seq_file *s) +static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) { struct pg_state st = { .seq = s, - .marker = address_markers, + .marker = pinfo->markers, .level = -1, .ptdump = { .note_page = note_page, .range = (struct ptdump_range[]) { - {KERN_VIRT_START, ULONG_MAX}, + {pinfo->base_addr, pinfo->end}, {0, 0} } } }; - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL); } void ptdump_check_wx(void) @@ -293,7 +324,7 @@ void ptdump_check_wx(void) static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk(m); + ptdump_walk(m, m->private); return 0; } @@ -308,8 +339,13 @@ static int ptdump_init(void) for (j = 0; j < ARRAY_SIZE(pte_bits); j++) pg_level[i].mask |= pte_bits[j].mask; - debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, + debugfs_create_file("kernel_page_tables", 0400, NULL, &kernel_ptd_info, &ptdump_fops); +#ifdef CONFIG_EFI + if (efi_enabled(EFI_RUNTIME_SERVICES)) + debugfs_create_file("efi_page_tables", 0400, NULL, &efi_ptd_info, + &ptdump_fops); +#endif return 0; } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d509bf23ef78..4a2a12be04c9 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -60,6 +60,7 @@ config S390 def_bool y select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEBUG_VM_PGTABLE + select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE @@ -73,6 +74,7 @@ config S390 select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VDSO_DATA select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH @@ -118,6 +120,8 @@ config S390 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_FIND_FIRST_BIT + select GENERIC_GETTIMEOFDAY + select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select HAVE_ALIGNED_STRUCT_PAGE if SLUB @@ -149,6 +153,7 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS + select HAVE_GENERIC_VDSO select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 @@ -186,6 +191,7 @@ config S390 select PCI_DOMAINS if PCI select PCI_MSI if PCI select PCI_MSI_ARCH_FALLBACKS if PCI_MSI + select SET_FS select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK @@ -804,6 +810,7 @@ menu "Virtualization" config PROTECTED_VIRTUALIZATION_GUEST def_bool n prompt "Protected virtualization guest support" + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS help Select this option, if you want to be able to run this kernel as a protected virtualization KVM guest. diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 761fe2b0b2f6..ab48b694ade8 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -3,17 +3,5 @@ config TRACE_IRQFLAGS_SUPPORT def_bool y -config S390_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" - depends on DEBUG_KERNEL - select DEBUG_FS - help - Say Y here if you want to show the kernel pagetable layout in a - debugfs file. This information is only useful for kernel developers - who are working in architecture specific areas of the kernel. - It is probably not a good idea to enable this feature in a production - kernel. - If in doubt, say "N" - config EARLY_PRINTK def_bool y diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 45b33b83de08..41a64b8dce25 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -73,7 +73,3 @@ $(obj)/startup.a: $(OBJECTS) FORCE install: sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ System.map "$(INSTALL_PATH)" - -chkbss := $(obj-y) -chkbss-target := startup.a -include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile index fa529c5b4486..b235ed95a3d8 100644 --- a/arch/s390/boot/compressed/Makefile +++ b/arch/s390/boot/compressed/Makefile @@ -62,7 +62,3 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed $(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE $(call if_changed,objcopy) - -chkbss := $(filter-out piggy.o info.o, $(obj-y)) -chkbss-target := vmlinux.bin -include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/boot/compressed/decompressor.c b/arch/s390/boot/compressed/decompressor.c index 368fd372c875..3061b11c4d27 100644 --- a/arch/s390/boot/compressed/decompressor.c +++ b/arch/s390/boot/compressed/decompressor.c @@ -16,7 +16,6 @@ * gzip declarations */ #define STATIC static -#define STATIC_RW_DATA static __section(.data) #undef memset #undef memcpy diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/compressed/vmlinux.lds.S index 44561b2c3712..9427e2cd0c15 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/compressed/vmlinux.lds.S @@ -59,6 +59,19 @@ SECTIONS BOOT_DATA_PRESERVED /* + * This is the BSS section of the decompressor and not of the decompressed Linux kernel. + * It will consume place in the decompressor's image. + */ + . = ALIGN(8); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + _ebss = .; + } + + /* * uncompressed image info used by the decompressor it should match * struct vmlinux_info. It comes from .vmlinux.info section of * uncompressed vmlinux in a form of info.o @@ -81,15 +94,6 @@ SECTIONS FILL(0xff); . = ALIGN(4096); } - . = ALIGN(256); - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - . = ALIGN(8); /* For convenience during zeroing */ - _ebss = .; - } _end = .; /* Sections to be discarded */ diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index dae10961d072..1a2c2b1ed964 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -360,22 +360,23 @@ ENTRY(startup_kdump) # the save area and does disabled wait with a faulty address. # ENTRY(startup_pgm_check_handler) - stmg %r0,%r15,__LC_SAVE_AREA_SYNC - la %r1,4095 - stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r1) - mvc __LC_GPREGS_SAVE_AREA-4095(128,%r1),__LC_SAVE_AREA_SYNC - mvc __LC_PSW_SAVE_AREA-4095(16,%r1),__LC_PGM_OLD_PSW + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + la %r8,4095 + stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r8) + stmg %r0,%r7,__LC_GPREGS_SAVE_AREA-4095(%r8) + mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA_SYNC + mvc __LC_PSW_SAVE_AREA-4095(16,%r8),__LC_PGM_OLD_PSW mvc __LC_RETURN_PSW(16),__LC_PGM_OLD_PSW ni __LC_RETURN_PSW,0xfc # remove IO and EX bits ni __LC_RETURN_PSW+1,0xfb # remove MCHK bit oi __LC_RETURN_PSW+1,0x2 # set wait state bit - larl %r2,.Lold_psw_disabled_wait - stg %r2,__LC_PGM_NEW_PSW+8 - l %r15,.Ldump_info_stack-.Lold_psw_disabled_wait(%r2) + larl %r9,.Lold_psw_disabled_wait + stg %r9,__LC_PGM_NEW_PSW+8 + l %r15,.Ldump_info_stack-.Lold_psw_disabled_wait(%r9) brasl %r14,print_pgm_check_info .Lold_psw_disabled_wait: - la %r1,4095 - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) + la %r8,4095 + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8) lpswe __LC_RETURN_PSW # disabled wait .Ldump_info_stack: .long 0x5000 + PAGE_SIZE - STACK_FRAME_OVERHEAD diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 8e222a666025..f94b91d72620 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -21,7 +21,7 @@ unsigned long __bootdata(memory_end); int __bootdata(memory_end_set); int __bootdata(noexec_disabled); -int kaslr_enabled __section(.data); +int kaslr_enabled; static inline int __diag308(unsigned long subcode, void *addr) { @@ -70,30 +70,44 @@ static size_t scpdata_length(const u8 *buf, size_t count) static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, const struct ipl_parameter_block *ipb) { - size_t count; - size_t i; + const __u8 *scp_data; + __u32 scp_data_len; int has_lowercase; + size_t count = 0; + size_t i; + + switch (ipb->pb0_hdr.pbt) { + case IPL_PBT_FCP: + scp_data_len = ipb->fcp.scp_data_len; + scp_data = ipb->fcp.scp_data; + break; + case IPL_PBT_NVME: + scp_data_len = ipb->nvme.scp_data_len; + scp_data = ipb->nvme.scp_data; + break; + default: + goto out; + } - count = min(size - 1, scpdata_length(ipb->fcp.scp_data, - ipb->fcp.scp_data_len)); + count = min(size - 1, scpdata_length(scp_data, scp_data_len)); if (!count) goto out; has_lowercase = 0; for (i = 0; i < count; i++) { - if (!isascii(ipb->fcp.scp_data[i])) { + if (!isascii(scp_data[i])) { count = 0; goto out; } - if (!has_lowercase && islower(ipb->fcp.scp_data[i])) + if (!has_lowercase && islower(scp_data[i])) has_lowercase = 1; } if (has_lowercase) - memcpy(dest, ipb->fcp.scp_data, count); + memcpy(dest, scp_data, count); else for (i = 0; i < count; i++) - dest[i] = tolower(ipb->fcp.scp_data[i]); + dest[i] = tolower(scp_data[i]); out: dest[count] = '\0'; return count; @@ -115,6 +129,7 @@ static void append_ipl_block_parm(void) parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; case IPL_PBT_FCP: + case IPL_PBT_NVME: rc = ipl_block_get_ascii_scpdata( parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; @@ -209,7 +224,7 @@ static void modify_fac_list(char *str) check_cleared_facilities(); } -static char command_line_buf[COMMAND_LINE_SIZE] __section(.data); +static char command_line_buf[COMMAND_LINE_SIZE]; void parse_boot_command_line(void) { char *param, *val; @@ -230,7 +245,7 @@ void parse_boot_command_line(void) if (!strcmp(param, "vmalloc") && val) vmalloc_size = round_up(memparse(val, NULL), PAGE_SIZE); - if (!strcmp(param, "dfltcc")) { + if (!strcmp(param, "dfltcc") && val) { if (!strcmp(val, "off")) zlib_dfltcc_support = ZLIB_DFLTCC_DISABLED; else if (!strcmp(val, "on")) @@ -254,17 +269,34 @@ void parse_boot_command_line(void) if (!strcmp(param, "nokaslr")) kaslr_enabled = 0; + +#if IS_ENABLED(CONFIG_KVM) + if (!strcmp(param, "prot_virt")) { + rc = kstrtobool(val, &enabled); + if (!rc && enabled) + prot_virt_host = 1; + } +#endif } } +static inline bool is_ipl_block_dump(void) +{ + if (ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) + return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_NVME && + ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return true; + return false; +} + void setup_memory_end(void) { #ifdef CONFIG_CRASH_DUMP if (OLDMEM_BASE) { kaslr_enabled = 0; - } else if (ipl_block_valid && - ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && - ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { + } else if (ipl_block_valid && is_ipl_block_dump()) { kaslr_enabled = 0; if (!sclp_early_get_hsa_size(&memory_end) && memory_end) memory_end_set = 1; diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index d4442163ffa9..d844a5ef9089 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -42,7 +42,7 @@ static int check_prng(void) return PRNG_MODE_TDES; } -static unsigned long get_random(unsigned long limit) +static int get_random(unsigned long limit, unsigned long *value) { struct prng_parm prng = { /* initial parameter block for tdes mode, copied from libica */ @@ -84,19 +84,101 @@ static unsigned long get_random(unsigned long limit) (u8 *) &random, sizeof(random)); break; default: - random = 0; + return -1; } - return random % limit; + *value = random % limit; + return 0; +} + +/* + * To randomize kernel base address we have to consider several facts: + * 1. physical online memory might not be continuous and have holes. mem_detect + * info contains list of online memory ranges we should consider. + * 2. we have several memory regions which are occupied and we should not + * overlap and destroy them. Currently safe_addr tells us the border below + * which all those occupied regions are. We are safe to use anything above + * safe_addr. + * 3. the upper limit might apply as well, even if memory above that limit is + * online. Currently those limitations are: + * 3.1. Limit set by "mem=" kernel command line option + * 3.2. memory reserved at the end for kasan initialization. + * 4. kernel base address must be aligned to THREAD_SIZE (kernel stack size). + * Which is required for CONFIG_CHECK_STACK. Currently THREAD_SIZE is 4 pages + * (16 pages when the kernel is built with kasan enabled) + * Assumptions: + * 1. kernel size (including .bss size) and upper memory limit are page aligned. + * 2. mem_detect memory region start is THREAD_SIZE aligned / end is PAGE_SIZE + * aligned (in practice memory configurations granularity on z/VM and LPAR + * is 1mb). + * + * To guarantee uniform distribution of kernel base address among all suitable + * addresses we generate random value just once. For that we need to build a + * continuous range in which every value would be suitable. We can build this + * range by simply counting all suitable addresses (let's call them positions) + * which would be valid as kernel base address. To count positions we iterate + * over online memory ranges. For each range which is big enough for the + * kernel image we count all suitable addresses we can put the kernel image at + * that is + * (end - start - kernel_size) / THREAD_SIZE + 1 + * Two functions count_valid_kernel_positions and position_to_address help + * to count positions in memory range given and then convert position back + * to address. + */ +static unsigned long count_valid_kernel_positions(unsigned long kernel_size, + unsigned long _min, + unsigned long _max) +{ + unsigned long start, end, pos = 0; + int i; + + for_each_mem_detect_block(i, &start, &end) { + if (_min >= end) + continue; + if (start >= _max) + break; + start = max(_min, start); + end = min(_max, end); + if (end - start < kernel_size) + continue; + pos += (end - start - kernel_size) / THREAD_SIZE + 1; + } + + return pos; +} + +static unsigned long position_to_address(unsigned long pos, unsigned long kernel_size, + unsigned long _min, unsigned long _max) +{ + unsigned long start, end; + int i; + + for_each_mem_detect_block(i, &start, &end) { + if (_min >= end) + continue; + if (start >= _max) + break; + start = max(_min, start); + end = min(_max, end); + if (end - start < kernel_size) + continue; + if ((end - start - kernel_size) / THREAD_SIZE + 1 >= pos) + return start + (pos - 1) * THREAD_SIZE; + pos -= (end - start - kernel_size) / THREAD_SIZE + 1; + } + + return 0; } unsigned long get_random_base(unsigned long safe_addr) { - unsigned long memory_limit = memory_end_set ? memory_end : 0; - unsigned long base, start, end, kernel_size; - unsigned long block_sum, offset; + unsigned long memory_limit = get_mem_detect_end(); + unsigned long base_pos, max_pos, kernel_size; unsigned long kasan_needs; int i; + if (memory_end_set) + memory_limit = min(memory_limit, memory_end); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) { if (safe_addr < INITRD_START + INITRD_SIZE) safe_addr = INITRD_START + INITRD_SIZE; @@ -126,45 +208,17 @@ unsigned long get_random_base(unsigned long safe_addr) } kernel_size = vmlinux.image_size + vmlinux.bss_size; - block_sum = 0; - for_each_mem_detect_block(i, &start, &end) { - if (memory_limit) { - if (start >= memory_limit) - break; - if (end > memory_limit) - end = memory_limit; - } - if (end - start < kernel_size) - continue; - block_sum += end - start - kernel_size; - } - if (!block_sum) { + if (safe_addr + kernel_size > memory_limit) + return 0; + + max_pos = count_valid_kernel_positions(kernel_size, safe_addr, memory_limit); + if (!max_pos) { sclp_early_printk("KASLR disabled: not enough memory\n"); return 0; } - base = get_random(block_sum); - if (base == 0) + /* we need a value in the range [1, base_pos] inclusive */ + if (get_random(max_pos, &base_pos)) return 0; - if (base < safe_addr) - base = safe_addr; - block_sum = offset = 0; - for_each_mem_detect_block(i, &start, &end) { - if (memory_limit) { - if (start >= memory_limit) - break; - if (end > memory_limit) - end = memory_limit; - } - if (end - start < kernel_size) - continue; - block_sum += end - start - kernel_size; - if (base <= block_sum) { - base = start + base - offset; - base = ALIGN_DOWN(base, THREAD_SIZE); - break; - } - offset = block_sum; - } - return base; + return position_to_address(base_pos + 1, kernel_size, safe_addr, memory_limit); } diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 83b5b7915c32..a3c9862bcede 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <asm/lowcore.h> +#include <asm/setup.h> #include <asm/sclp.h> #include "boot.h" @@ -32,7 +33,8 @@ void print_pgm_check_info(void) char *p; add_str(buf, "Linux version "); - strlcat(buf, kernel_version, sizeof(buf)); + strlcat(buf, kernel_version, sizeof(buf) - 1); + strlcat(buf, "\n", sizeof(buf)); sclp_early_printk(buf); p = add_str(buf, "Kernel fault: interruption code "); @@ -42,6 +44,13 @@ void print_pgm_check_info(void) add_str(p, "\n"); sclp_early_printk(buf); + if (kaslr_enabled) { + p = add_str(buf, "Kernel random base: "); + p = add_val_as_hex(p, __kaslr_offset); + add_str(p, "\n"); + sclp_early_printk(buf); + } + p = add_str(buf, "PSW : "); p = add_val_as_hex(p, S390_lowcore.psw_save_area.mask); p = add_str(p, " "); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 3b3a11f95269..90842936545b 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -48,8 +48,6 @@ struct diag_ops __bootdata_preserved(diag_dma_ops) = { }; static struct diag210 _diag210_tmp_dma __section(.dma.data); struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma; -void _swsusp_reset_dma(void); -unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma); void error(char *x) { @@ -120,6 +118,9 @@ static void handle_relocs(unsigned long offset) } } +/* + * This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's. + */ static void clear_bss_section(void) { memset((void *)vmlinux.default_lma + vmlinux.image_size, 0, vmlinux.bss_size); diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S index 9715715c4c28..f7c77cd518f2 100644 --- a/arch/s390/boot/text_dma.S +++ b/arch/s390/boot/text_dma.S @@ -97,23 +97,6 @@ ENTRY(_diag0c_dma) ENDPROC(_diag0c_dma) /* - * void _swsusp_reset_dma(void) - */ -ENTRY(_swsusp_reset_dma) - larl %r1,restart_entry - larl %r2,.Lrestart_diag308_psw - og %r1,0(%r2) - stg %r1,0(%r0) - lghi %r0,0 - diag %r0,%r0,0x308 -restart_entry: - lhi %r1,1 - sigp %r1,%r0,SIGP_SET_ARCHITECTURE - sam64 - BR_EX_DMA_r14 -ENDPROC(_swsusp_reset_dma) - -/* * void _diag308_reset_dma(void) * * Calls diag 308 subcode 1 and continues execution diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index f887a479cdc7..a15c033f53ca 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -7,6 +7,9 @@ #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); #endif +#if IS_ENABLED(CONFIG_KVM) +int __bootdata_preserved(prot_virt_host); +#endif struct uv_info __bootdata_preserved(uv_info); void uv_query_info(void) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 7228aabe9da6..0784bf3caf43 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -775,6 +775,8 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_PAGEALLOC=y CONFIG_PAGE_OWNER=y CONFIG_DEBUG_RODATA_TEST=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_SELFTEST=y CONFIG_DEBUG_OBJECTS_FREE=y @@ -822,7 +824,6 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y -CONFIG_S390_PTDUMP=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index fab03b7a6932..905bc8c4cfaf 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -759,6 +759,8 @@ CONFIG_GDB_SCRIPTS=y CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_PANIC_ON_OOPS=y CONFIG_TEST_LOCKUP=m @@ -775,7 +777,6 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y -CONFIG_S390_PTDUMP=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index 3cfe1eb89838..c0be5fe1ddba 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -238,7 +238,10 @@ extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); struct channel_path_desc_fmt0 *ccw_device_get_chp_desc(struct ccw_device *, int); u8 *ccw_device_get_util_str(struct ccw_device *cdev, int chp_idx); int ccw_device_pnso(struct ccw_device *cdev, - struct chsc_pnso_area *pnso_area, - struct chsc_pnso_resume_token resume_token, - int cnc); + struct chsc_pnso_area *pnso_area, u8 oc, + struct chsc_pnso_resume_token resume_token, int cnc); +int ccw_device_get_cssid(struct ccw_device *cdev, u8 *cssid); +int ccw_device_get_iid(struct ccw_device *cdev, u8 *iid); +int ccw_device_get_chpid(struct ccw_device *cdev, int chp_idx, u8 *chpid); +int ccw_device_get_chid(struct ccw_device *cdev, int chp_idx, u16 *chid); #endif /* _S390_CCWDEV_H_ */ diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index 6813bfa1eeb7..a8c02cfbc712 100644 --- a/arch/s390/include/asm/checksum.h +++ b/arch/s390/include/asm/checksum.h @@ -13,21 +13,21 @@ #define _S390_CHECKSUM_H #include <linux/uaccess.h> +#include <linux/in6.h> /* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic + * Returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic. * - * this function must be called with even lengths, except - * for the last fragment, which may be odd + * This function must be called with even lengths, except + * for the last fragment, which may be odd. * - * it's best to have buff aligned on a 32-bit boundary + * It's best to have buff aligned on a 32-bit boundary. */ -static inline __wsum -csum_partial(const void *buff, int len, __wsum sum) +static inline __wsum csum_partial(const void *buff, int len, __wsum sum) { register unsigned long reg2 asm("2") = (unsigned long) buff; register unsigned long reg3 asm("3") = (unsigned long) len; @@ -40,74 +40,91 @@ csum_partial(const void *buff, int len, __wsum sum) } /* - * Fold a partial checksum without adding pseudo headers + * Fold a partial checksum without adding pseudo headers. */ static inline __sum16 csum_fold(__wsum sum) { u32 csum = (__force u32) sum; - csum += (csum >> 16) + (csum << 16); + csum += (csum >> 16) | (csum << 16); csum >>= 16; return (__force __sum16) ~csum; } /* - * This is a version of ip_compute_csum() optimized for IP headers, - * which always checksum on 4 octet boundaries. - * + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksums on 4 octet boundaries. */ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { - return csum_fold(csum_partial(iph, ihl*4, 0)); + __u64 csum = 0; + __u32 *ptr = (u32 *)iph; + + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + ihl -= 4; + while (ihl--) + csum += *ptr++; + csum += (csum >> 32) | (csum << 32); + return csum_fold((__force __wsum)(csum >> 32)); } /* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 32-bit checksum + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 32-bit checksum. */ -static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, - __wsum sum) +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { - __u32 csum = (__force __u32)sum; + __u64 csum = (__force __u64)sum; csum += (__force __u32)saddr; - if (csum < (__force __u32)saddr) - csum++; - csum += (__force __u32)daddr; - if (csum < (__force __u32)daddr) - csum++; - - csum += len + proto; - if (csum < len + proto) - csum++; - - return (__force __wsum)csum; + csum += len; + csum += proto; + csum += (csum >> 32) | (csum << 32); + return (__force __wsum)(csum >> 32); } /* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 16-bit checksum, already complemented + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 16-bit checksum, already complemented. */ - -static inline __sum16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, - __wsum sum) +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { - return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } /* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c + * Used for miscellaneous IP-like checksums, mainly icmp. */ - static inline __sum16 ip_compute_csum(const void *buff, int len) { return csum_fold(csum_partial(buff, len, 0)); } -#endif /* _S390_CHECKSUM_H */ - +#define _HAVE_ARCH_IPV6_CSUM +static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum csum) +{ + __u64 sum = (__force __u64)csum; + + sum += (__force __u32)saddr->s6_addr32[0]; + sum += (__force __u32)saddr->s6_addr32[1]; + sum += (__force __u32)saddr->s6_addr32[2]; + sum += (__force __u32)saddr->s6_addr32[3]; + sum += (__force __u32)daddr->s6_addr32[0]; + sum += (__force __u32)daddr->s6_addr32[1]; + sum += (__force __u32)daddr->s6_addr32[2]; + sum += (__force __u32)daddr->s6_addr32[3]; + sum += len; + sum += proto; + sum += (sum >> 32) | (sum << 32); + return csum_fold((__force __wsum)(sum >> 32)); +} +#endif /* _S390_CHECKSUM_H */ diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h index 36ce2d25a5fc..ae4d2549cd67 100644 --- a/arch/s390/include/asm/chsc.h +++ b/arch/s390/include/asm/chsc.h @@ -12,6 +12,13 @@ #include <uapi/asm/chsc.h> /** + * Operation codes for CHSC PNSO: + * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport + * PNSO_OC_NET_ADDR_INFO - all addresses + */ +#define PNSO_OC_NET_BRIDGE_INFO 0 +#define PNSO_OC_NET_ADDR_INFO 3 +/** * struct chsc_pnso_naid_l2 - network address information descriptor * @nit: Network interface token * @addr_lnid: network address and logical network id (VLAN ID) diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index b5bfb3123cb1..5c58756d6476 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -356,7 +356,6 @@ static inline u8 pathmask_to_pos(u8 mask) return 8 - ffs(mask); } -void channel_subsystem_reinit(void); extern void css_schedule_reprobe(void); extern void *cio_dma_zalloc(size_t size); @@ -372,6 +371,7 @@ struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); +int chsc_stzi(void *page, void *result, size_t size); int chsc_sgib(u32 origin); #endif diff --git a/arch/s390/include/asm/clocksource.h b/arch/s390/include/asm/clocksource.h new file mode 100644 index 000000000000..03434369fce4 --- /dev/null +++ b/arch/s390/include/asm/clocksource.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* s390-specific clocksource additions */ + +#ifndef _ASM_S390_CLOCKSOURCE_H +#define _ASM_S390_CLOCKSOURCE_H + +#endif /* _ASM_S390_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/clp.h b/arch/s390/include/asm/clp.h index 3925b0f085b7..10919eeb7533 100644 --- a/arch/s390/include/asm/clp.h +++ b/arch/s390/include/asm/clp.h @@ -5,6 +5,9 @@ /* CLP common request & response block size */ #define CLP_BLK_SIZE PAGE_SIZE +/* Call Logical Processor - Command Code */ +#define CLP_SLPC 0x0001 + #define CLP_LPS_BASE 0 #define CLP_LPS_PCI 2 diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h index 480bb02ccacd..638137d46c85 100644 --- a/arch/s390/include/asm/css_chars.h +++ b/arch/s390/include/asm/css_chars.h @@ -36,7 +36,9 @@ struct css_general_char { u64 alt_ssi : 1; /* bit 108 */ u64 : 1; u64 narf : 1; /* bit 110 */ - u64 : 12; + u64 : 5; + u64 enarf: 1; /* bit 116 */ + u64 : 6; u64 util_str : 1;/* bit 123 */ } __packed; diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index a816fb4734b8..40264f60b0da 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -140,8 +140,6 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); -void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, - unsigned long bits); int gmap_mprotect_notify(struct gmap *, unsigned long start, unsigned long len, int prot); diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index da014e4f8113..28664ee0abc1 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <asm/page.h> +#include <asm/pgtable.h> #include <asm/pci_io.h> #define xlate_dev_mem_ptr xlate_dev_mem_ptr @@ -26,7 +27,10 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot); void __iomem *ioremap(phys_addr_t addr, size_t size); +void __iomem *ioremap_wc(phys_addr_t addr, size_t size); +void __iomem *ioremap_wt(phys_addr_t addr, size_t size); void iounmap(volatile void __iomem *addr); static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) @@ -52,6 +56,10 @@ static inline void ioport_unmap(void __iomem *p) #define pci_iomap_wc pci_iomap_wc #define pci_iomap_wc_range pci_iomap_wc_range +#define ioremap ioremap +#define ioremap_wt ioremap_wt +#define ioremap_wc ioremap_wc + #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 7d5cfdda5277..a9e2c7295b35 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -66,6 +66,7 @@ enum ipl_type { IPL_TYPE_FCP_DUMP = 8, IPL_TYPE_NSS = 16, IPL_TYPE_NVME = 32, + IPL_TYPE_NVME_DUMP = 64, }; struct ipl_info @@ -94,6 +95,12 @@ extern struct ipl_info ipl_info; extern void setup_ipl(void); extern void set_os_info_reipl_block(void); +static inline bool is_ipl_type_dump(void) +{ + return (ipl_info.type == IPL_TYPE_FCP_DUMP) || + (ipl_info.type == IPL_TYPE_NVME_DUMP); +} + struct ipl_report { struct ipl_parameter_block *ipib; struct list_head components; diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h index 89d6886040c8..e9bf486de136 100644 --- a/arch/s390/include/asm/kasan.h +++ b/arch/s390/include/asm/kasan.h @@ -19,6 +19,7 @@ extern void kasan_early_init(void); extern void kasan_copy_shadow(pgd_t *dst); extern void kasan_free_early_identity(void); +extern unsigned long kasan_vmax; #else static inline void kasan_early_init(void) { } static inline void kasan_copy_shadow(pgd_t *dst) { } diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 99b92c3e46b0..212628932ddc 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -132,7 +132,8 @@ struct zpci_dev { u8 rid_available : 1; u8 has_hp_slot : 1; u8 is_physfn : 1; - u8 reserved : 5; + u8 util_str_avail : 1; + u8 reserved : 4; unsigned int devfn; /* DEVFN part of the RID*/ struct mutex lock; @@ -179,6 +180,7 @@ struct zpci_dev { atomic64_t mapped_pages; atomic64_t unmapped_pages; + u8 version; enum pci_bus_speed max_bus_speed; struct dentry *debugfs_dev; @@ -208,9 +210,8 @@ int zpci_unregister_ioat(struct zpci_dev *, u8); void zpci_remove_reserved_devices(void); /* CLP */ +int clp_setup_writeback_mio(void); int clp_scan_pci_devices(void); -int clp_rescan_pci_devices(void); -int clp_rescan_pci_devices_simple(u32 *fid); int clp_add_pci_device(u32, u32, int); int clp_enable_fh(struct zpci_dev *, u8); int clp_disable_fh(struct zpci_dev *); @@ -232,12 +233,10 @@ static inline bool zpci_use_mio(struct zpci_dev *zdev) /* Error handling and recovery */ void zpci_event_error(void *); void zpci_event_availability(void *); -void zpci_rescan(void); bool zpci_is_enabled(void); #else /* CONFIG_PCI */ static inline void zpci_event_error(void *e) {} static inline void zpci_event_availability(void *e) {} -static inline void zpci_rescan(void) {} #endif /* CONFIG_PCI */ #ifdef CONFIG_HOTPLUG_PCI_S390 @@ -282,7 +281,6 @@ int zpci_debug_init(void); void zpci_debug_exit(void); void zpci_debug_init_device(struct zpci_dev *, const char *); void zpci_debug_exit_device(struct zpci_dev *); -void zpci_debug_info(struct zpci_dev *, struct seq_file *); /* Error reporting */ int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *); diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index eb51272dd2cc..1f4b666e85ee 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -7,6 +7,7 @@ /* * Call Logical Processor - Command Codes */ +#define CLP_SLPC 0x0001 #define CLP_LIST_PCI 0x0002 #define CLP_QUERY_PCI_FN 0x0003 #define CLP_QUERY_PCI_FNGRP 0x0004 @@ -51,6 +52,19 @@ struct clp_fh_list_entry { extern bool zpci_unique_uid; +struct clp_rsp_slpc_pci { + struct clp_rsp_hdr hdr; + u32 reserved2[4]; + u32 lpif[8]; + u32 reserved3[4]; + u32 vwb : 1; + u32 : 1; + u32 mio_wb : 6; + u32 : 24; + u32 reserved5[3]; + u32 lpic[8]; +} __packed; + /* List PCI functions request */ struct clp_req_list_pci { struct clp_req_hdr hdr; @@ -172,6 +186,11 @@ struct clp_rsp_set_pci { } __packed; /* Combined request/response block structures used by clp insn */ +struct clp_req_rsp_slpc_pci { + struct clp_req_slpc request; + struct clp_rsp_slpc_pci response; +} __packed; + struct clp_req_rsp_list_pci { struct clp_req_list_pci request; struct clp_rsp_list_pci response; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 74a352f8c0d1..d1297d6bbdcf 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -146,8 +146,6 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) -extern void rcu_table_freelist_finish(void); - void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); pte_t *vmem_pte_alloc(void); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b55561cc8786..6b8d8c69b1a1 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -89,6 +89,7 @@ extern unsigned long VMALLOC_START; extern unsigned long VMALLOC_END; #define VMALLOC_DEFAULT_SIZE ((128UL << 30) - MODULES_LEN) extern struct page *vmemmap; +extern unsigned long vmemmap_size; #define VMEM_MAX_PHYS ((unsigned long) vmemmap) @@ -1186,6 +1187,12 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); +#define pgprot_writecombine pgprot_writecombine +pgprot_t pgprot_writecombine(pgprot_t prot); + +#define pgprot_writethrough pgprot_writethrough +pgprot_t pgprot_writethrough(pgprot_t prot); + /* * Certain architectures need to do special things when PTEs * within a page table are directly modified. Thus, the following @@ -1209,7 +1216,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) { pte_t __pte; - pte_val(__pte) = physpage + pgprot_val(pgprot); + + pte_val(__pte) = physpage | pgprot_val(pgprot); if (!MACHINE_HAS_NX) pte_val(__pte) &= ~_PAGE_NOEXEC; return pte_mkyoung(__pte); diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h new file mode 100644 index 000000000000..f960b2896606 --- /dev/null +++ b/arch/s390/include/asm/ptdump.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_PTDUMP_H +#define _ASM_S390_PTDUMP_H + +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} + +#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index e69dbf438f99..19e84c95d1e7 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -26,9 +26,9 @@ /** * struct qdesfmt0 - queue descriptor, format 0 - * @sliba: storage list information block address - * @sla: storage list address - * @slsba: storage list state block address + * @sliba: absolute address of storage list information block + * @sla: absolute address of storage list + * @slsba: absolute address of storage list state block * @akey: access key for SLIB * @bkey: access key for SL * @ckey: access key for SBALs @@ -56,7 +56,7 @@ struct qdesfmt0 { * @oqdcnt: output queue descriptor count * @iqdsz: input queue descriptor size * @oqdsz: output queue descriptor size - * @qiba: queue information block address + * @qiba: absolute address of queue information block * @qkey: queue information block key * @qdf0: queue descriptions */ @@ -327,7 +327,6 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, * struct qdio_initialize - qdio initialization data * @q_format: queue format * @qdr_ac: feature flags to set - * @adapter_name: name for the adapter * @qib_param_field_format: format for qib_parm_field * @qib_param_field: pointer to 128 bytes or NULL, if no param field * @qib_rflags: rflags to set @@ -347,7 +346,6 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, struct qdio_initialize { unsigned char q_format; unsigned char qdr_ac; - unsigned char adapter_name[8]; unsigned int qib_param_field_format; unsigned char *qib_param_field; unsigned char qib_rflags; diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index c563f8368b19..a7bdd128d85b 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -114,8 +114,7 @@ int sclp_early_get_core_info(struct sclp_core_info *info); void sclp_early_get_ipl_info(struct sclp_ipl_info *info); void sclp_early_detect(void); void sclp_early_printk(const char *s); -void sclp_early_printk_force(const char *s); -void __sclp_early_printk(const char *s, unsigned int len, unsigned int force); +void __sclp_early_printk(const char *s, unsigned int len); int sclp_early_get_memsize(unsigned long *mem); int sclp_early_get_hsa_size(unsigned long *hsa_size); @@ -129,6 +128,8 @@ int sclp_chp_deconfigure(struct chp_id chpid); int sclp_chp_read_info(struct sclp_chp_info *info); int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); +int sclp_ap_configure(u32 apid); +int sclp_ap_deconfigure(u32 apid); int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid); int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count); int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count); diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h index c59a83536c70..a22a5a81811c 100644 --- a/arch/s390/include/asm/set_memory.h +++ b/arch/s390/include/asm/set_memory.h @@ -2,6 +2,10 @@ #ifndef _ASMS390_SET_MEMORY_H #define _ASMS390_SET_MEMORY_H +#include <linux/mutex.h> + +extern struct mutex cpa_mutex; + #define SET_MEMORY_RO 1UL #define SET_MEMORY_RW 2UL #define SET_MEMORY_NX 4UL diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 534f212753d6..bdb242a1544e 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -92,7 +92,9 @@ extern int memory_end_set; extern unsigned long memory_end; extern unsigned long vmalloc_size; extern unsigned long max_physmem_end; -extern unsigned long __swsusp_reset_dma; + +/* The Write Back bit position in the physaddr is given by the SLPC PCI */ +extern unsigned long mio_wb_bit_mask; #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -119,9 +121,6 @@ extern unsigned int console_mode; extern unsigned int console_devno; extern unsigned int console_irq; -extern char vmhalt_cmd[]; -extern char vmpoff_cmd[]; - #define CONSOLE_IS_UNDEFINED (console_mode == 0) #define CONSOLE_IS_SCLP (console_mode == 1) #define CONSOLE_IS_3215 (console_mode == 2) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7e155fb6c254..01e360004481 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -31,7 +31,6 @@ extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address); extern int smp_store_status(int cpu); extern void smp_save_dump_cpus(void); -extern int smp_vcpu_scheduled(int cpu); extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h index f0ddefb06ec8..ba07463897c1 100644 --- a/arch/s390/include/asm/stp.h +++ b/arch/s390/include/asm/stp.h @@ -6,43 +6,89 @@ #ifndef __S390_STP_H #define __S390_STP_H +#include <linux/compiler.h> + /* notifier for syncs */ extern struct atomic_notifier_head s390_epoch_delta_notifier; /* STP interruption parameter */ struct stp_irq_parm { - unsigned int _pad0 : 14; - unsigned int tsc : 1; /* Timing status change */ - unsigned int lac : 1; /* Link availability change */ - unsigned int tcpc : 1; /* Time control parameter change */ - unsigned int _pad2 : 15; -} __attribute__ ((packed)); + u32 : 14; + u32 tsc : 1; /* Timing status change */ + u32 lac : 1; /* Link availability change */ + u32 tcpc : 1; /* Time control parameter change */ + u32 : 15; +} __packed; #define STP_OP_SYNC 1 #define STP_OP_CTRL 3 struct stp_sstpi { - unsigned int rsvd0; - unsigned int rsvd1 : 8; - unsigned int stratum : 8; - unsigned int vbits : 16; - unsigned int leaps : 16; - unsigned int tmd : 4; - unsigned int ctn : 4; - unsigned int rsvd2 : 3; - unsigned int c : 1; - unsigned int tst : 4; - unsigned int tzo : 16; - unsigned int dsto : 16; - unsigned int ctrl : 16; - unsigned int rsvd3 : 16; - unsigned int tto; - unsigned int rsvd4; - unsigned int ctnid[3]; - unsigned int rsvd5; - unsigned int todoff[4]; - unsigned int rsvd6[48]; -} __attribute__ ((packed)); + u32 : 32; + u32 tu : 1; + u32 lu : 1; + u32 : 6; + u32 stratum : 8; + u32 vbits : 16; + u32 leaps : 16; + u32 tmd : 4; + u32 ctn : 4; + u32 : 3; + u32 c : 1; + u32 tst : 4; + u32 tzo : 16; + u32 dsto : 16; + u32 ctrl : 16; + u32 : 16; + u32 tto; + u32 : 32; + u32 ctnid[3]; + u32 : 32; + u32 todoff[4]; + u32 rsvd[48]; +} __packed; + +struct stp_tzib { + u32 tzan : 16; + u32 : 16; + u32 tzo : 16; + u32 dsto : 16; + u32 stn; + u32 dstn; + u64 dst_on_alg; + u64 dst_off_alg; +} __packed; + +struct stp_tcpib { + u32 atcode : 4; + u32 ntcode : 4; + u32 d : 1; + u32 : 23; + s32 tto; + struct stp_tzib atzib; + struct stp_tzib ntzib; + s32 adst_offset : 16; + s32 ndst_offset : 16; + u32 rsvd1; + u64 ntzib_update; + u64 ndsto_update; +} __packed; + +struct stp_lsoib { + u32 p : 1; + u32 : 31; + s32 also : 16; + s32 nlso : 16; + u64 nlsout; +} __packed; + +struct stp_stzi { + u32 rsvd0[3]; + u64 data_ts; + u32 rsvd1[22]; + struct stp_tcpib tcpib; + struct stp_lsoib lsoib; +} __packed; /* Functions needed by the machine check handler */ int stp_sync_check(void); diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index acce6a08a1fa..6448bb5be10c 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -30,8 +30,6 @@ static inline void __tlb_flush_idte(unsigned long asce) : : "a" (opt), "a" (asce) : "cc"); } -void smp_ptlb_all(void); - /* * Flush all TLB entries on all CPUs. */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index f09444d6aeab..c868e7ee49b3 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -60,6 +60,9 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); #define INLINE_COPY_TO_USER #endif +int __put_user_bad(void) __attribute__((noreturn)); +int __get_user_bad(void) __attribute__((noreturn)); + #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES #define __put_get_user_asm(to, from, size, spec) \ @@ -109,6 +112,9 @@ static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned lon (unsigned long *)x, size, spec); break; + default: + __put_user_bad(); + break; } return rc; } @@ -139,6 +145,9 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign (unsigned long __user *)ptr, size, spec); break; + default: + __get_user_bad(); + break; } return rc; } @@ -179,7 +188,7 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s default: \ __put_user_bad(); \ break; \ - } \ + } \ __builtin_expect(__pu_err, 0); \ }) @@ -190,8 +199,6 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s }) -int __put_user_bad(void) __attribute__((noreturn)); - #define __get_user(x, ptr) \ ({ \ int __gu_err = -EFAULT; \ @@ -238,8 +245,6 @@ int __put_user_bad(void) __attribute__((noreturn)); __get_user(x, ptr); \ }) -int __get_user_bad(void) __attribute__((noreturn)); - unsigned long __must_check raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); @@ -278,4 +283,115 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo int copy_to_user_real(void __user *dest, void *src, unsigned long count); void *s390_kernel_write(void *dst, const void *src, size_t size); +#define HAVE_GET_KERNEL_NOFAULT + +int __noreturn __put_kernel_bad(void); + +#define __put_kernel_asm(val, to, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %2,%1\n" \ + "1: xr %0,%0\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: lhi %0,%3\n" \ + " jg 2b\n" \ + ".popsection\n" \ + EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ + : "=d" (__rc), "+Q" (*(to)) \ + : "d" (val), "K" (-EFAULT) \ + : "cc"); \ + __rc; \ +}) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + u64 __x = (u64)(*((type *)(src))); \ + int __pk_err; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \ + break; \ + case 2: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \ + break; \ + case 4: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \ + break; \ + case 8: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \ + break; \ + default: \ + __pk_err = __put_kernel_bad(); \ + break; \ + } \ + if (unlikely(__pk_err)) \ + goto err_label; \ +} while (0) + +int __noreturn __get_kernel_bad(void); + +#define __get_kernel_asm(val, from, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %1,%2\n" \ + "1: xr %0,%0\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: lhi %0,%3\n" \ + " jg 2b\n" \ + ".popsection\n" \ + EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ + : "=d" (__rc), "+d" (val) \ + : "Q" (*(from)), "K" (-EFAULT) \ + : "cc"); \ + __rc; \ +}) + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __gk_err; \ + \ + switch (sizeof(type)) { \ + case 1: { \ + u8 __x = 0; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 2: { \ + u16 __x = 0; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 4: { \ + u32 __x = 0; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 8: { \ + u64 __x = 0; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + default: \ + __gk_err = __get_kernel_bad(); \ + break; \ + } \ + if (unlikely(__gk_err)) \ + goto err_label; \ +} while (0) + #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index cff4b4c99b75..0325fc0469b7 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -33,6 +33,7 @@ #define UVC_CMD_DESTROY_SEC_CPU 0x0121 #define UVC_CMD_CONV_TO_SEC_STOR 0x0200 #define UVC_CMD_CONV_FROM_SEC_STOR 0x0201 +#define UVC_CMD_DESTR_SEC_STOR 0x0202 #define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 #define UVC_CMD_UNPACK_IMG 0x0301 #define UVC_CMD_VERIFY_IMG 0x0302 @@ -344,6 +345,7 @@ static inline int is_prot_virt_host(void) } int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int uv_destroy_page(unsigned long paddr); int uv_convert_from_secure(unsigned long paddr); int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); @@ -354,6 +356,11 @@ void adjust_to_uv_max(unsigned long *vmax); static inline void setup_uv(void) {} static inline void adjust_to_uv_max(unsigned long *vmax) {} +static inline int uv_destroy_page(unsigned long paddr) +{ + return 0; +} + static inline int uv_convert_from_secure(unsigned long paddr) { return 0; diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 0cd085cdeb4f..29b44a930e71 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -2,6 +2,8 @@ #ifndef __S390_VDSO_H__ #define __S390_VDSO_H__ +#include <vdso/datapage.h> + /* Default link addresses for the vDSOs */ #define VDSO32_LBASE 0 #define VDSO64_LBASE 0 @@ -18,30 +20,7 @@ * itself and may change without notice. */ -struct vdso_data { - __u64 tb_update_count; /* Timebase atomicity ctr 0x00 */ - __u64 xtime_tod_stamp; /* TOD clock for xtime 0x08 */ - __u64 xtime_clock_sec; /* Kernel time 0x10 */ - __u64 xtime_clock_nsec; /* 0x18 */ - __u64 xtime_coarse_sec; /* Coarse kernel time 0x20 */ - __u64 xtime_coarse_nsec; /* 0x28 */ - __u64 wtom_clock_sec; /* Wall to monotonic clock 0x30 */ - __u64 wtom_clock_nsec; /* 0x38 */ - __u64 wtom_coarse_sec; /* Coarse wall to monotonic 0x40 */ - __u64 wtom_coarse_nsec; /* 0x48 */ - __u32 tz_minuteswest; /* Minutes west of Greenwich 0x50 */ - __u32 tz_dsttime; /* Type of dst correction 0x54 */ - __u32 ectg_available; /* ECTG instruction present 0x58 */ - __u32 tk_mult; /* Mult. used for xtime_nsec 0x5c */ - __u32 tk_shift; /* Shift used for xtime_nsec 0x60 */ - __u32 ts_dir; /* TOD steering direction 0x64 */ - __u64 ts_end; /* TOD steering end 0x68 */ - __u32 hrtimer_res; /* hrtimer resolution 0x70 */ -}; - struct vdso_per_cpu_data { - __u64 ectg_timer_base; - __u64 ectg_user_time; /* * Note: node_id and cpu_nr must be at adjacent memory locations. * VDSO userspace must read both values with a single instruction. @@ -56,9 +35,7 @@ struct vdso_per_cpu_data { }; extern struct vdso_data *vdso_data; -extern struct vdso_data boot_vdso_data; -void vdso_alloc_boot_cpu(struct lowcore *lowcore); int vdso_alloc_per_cpu(struct lowcore *lowcore); void vdso_free_per_cpu(struct lowcore *lowcore); diff --git a/arch/s390/include/asm/vdso/clocksource.h b/arch/s390/include/asm/vdso/clocksource.h new file mode 100644 index 000000000000..a93eda0ce7bb --- /dev/null +++ b/arch/s390/include/asm/vdso/clocksource.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_CLOCKSOURCE_H +#define __ASM_VDSO_CLOCKSOURCE_H + +#define VDSO_ARCH_CLOCKMODES \ + VDSO_CLOCKMODE_TOD + +#endif /* __ASM_VDSO_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h new file mode 100644 index 000000000000..7b3cdb4a5f48 --- /dev/null +++ b/arch/s390/include/asm/vdso/data.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_ASM_VDSO_DATA_H +#define __S390_ASM_VDSO_DATA_H + +#include <linux/types.h> +#include <vdso/datapage.h> + +struct arch_vdso_data { + __u64 tod_steering_delta; + __u64 tod_steering_end; +}; + +#endif /* __S390_ASM_VDSO_DATA_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h new file mode 100644 index 000000000000..bf123065ad3b --- /dev/null +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_VDSO_GETTIMEOFDAY_H +#define ASM_VDSO_GETTIMEOFDAY_H + +#define VDSO_HAS_TIME 1 + +#define VDSO_HAS_CLOCK_GETRES 1 + +#include <asm/timex.h> +#include <asm/unistd.h> +#include <asm/vdso.h> +#include <linux/compiler.h> + +#define vdso_calc_delta __arch_vdso_calc_delta +static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +{ + return (cycles - last) * mult; +} + +static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +{ + return _vdso_data; +} + +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) +{ + const struct vdso_data *vdso = __arch_get_vdso_data(); + u64 adj, now; + + now = get_tod_clock(); + adj = vdso->arch_data.tod_steering_end - now; + if (unlikely((s64) adj > 0)) + now += (vdso->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + return now; +} + +static __always_inline +long clock_gettime_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + register unsigned long r1 __asm__("r1") = __NR_clock_gettime; + register unsigned long r2 __asm__("r2") = (unsigned long)clkid; + register void *r3 __asm__("r3") = ts; + + asm ("svc 0\n" : "+d" (r2) : "d" (r1), "d" (r3) : "cc", "memory"); + return r2; +} + +static __always_inline +long gettimeofday_fallback(register struct __kernel_old_timeval *tv, + register struct timezone *tz) +{ + register unsigned long r1 __asm__("r1") = __NR_gettimeofday; + register unsigned long r2 __asm__("r2") = (unsigned long)tv; + register void *r3 __asm__("r3") = tz; + + asm ("svc 0\n" : "+d" (r2) : "d" (r1), "d" (r3) : "cc", "memory"); + return r2; +} + +static __always_inline +long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + register unsigned long r1 __asm__("r1") = __NR_clock_getres; + register unsigned long r2 __asm__("r2") = (unsigned long)clkid; + register void *r3 __asm__("r3") = ts; + + asm ("svc 0\n" : "+d" (r2) : "d" (r1), "d" (r3) : "cc", "memory"); + return r2; +} + +#endif diff --git a/arch/s390/include/asm/vdso/processor.h b/arch/s390/include/asm/vdso/processor.h new file mode 100644 index 000000000000..cfcc3e117c4c --- /dev/null +++ b/arch/s390/include/asm/vdso/processor.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_VDSO_PROCESSOR_H +#define __ASM_VDSO_PROCESSOR_H + +#define cpu_relax() barrier() + +#endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/s390/include/asm/vdso/vdso.h b/arch/s390/include/asm/vdso/vdso.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/arch/s390/include/asm/vdso/vdso.h diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h new file mode 100644 index 000000000000..6c67c08cefdd --- /dev/null +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_VSYSCALL_H +#define __ASM_VDSO_VSYSCALL_H + +#ifndef __ASSEMBLY__ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <asm/vdso.h> +/* + * Update the vDSO data page to keep in sync with kernel timekeeping. + */ + +static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) +{ + return vdso_data; +} +#define __arch_get_k_vdso_data __s390_get_k_vdso_data + +/* The asm-generic header needs to be included after the definitions above */ +#include <asm-generic/vdso/vsyscall.h> + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/s390/include/asm/vtimer.h b/arch/s390/include/asm/vtimer.h index 42f707d1c1e8..e601adaa6320 100644 --- a/arch/s390/include/asm/vtimer.h +++ b/arch/s390/include/asm/vtimer.h @@ -25,8 +25,6 @@ extern void add_virt_timer_periodic(struct vtimer_list *timer); extern int mod_virt_timer(struct vtimer_list *timer, u64 expires); extern int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires); extern int del_virt_timer(struct vtimer_list *timer); - -extern void init_cpu_vtimer(void); extern void vtime_init(void); #endif /* _ASM_S390_TIMER_H */ diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h index d27d7d329263..7349e96d28a0 100644 --- a/arch/s390/include/uapi/asm/pkey.h +++ b/arch/s390/include/uapi/asm/pkey.h @@ -35,12 +35,16 @@ #define PKEY_KEYTYPE_AES_128 1 #define PKEY_KEYTYPE_AES_192 2 #define PKEY_KEYTYPE_AES_256 3 +#define PKEY_KEYTYPE_ECC 4 /* the newer ioctls use a pkey_key_type enum for type information */ enum pkey_key_type { PKEY_TYPE_CCA_DATA = (__u32) 1, PKEY_TYPE_CCA_CIPHER = (__u32) 2, PKEY_TYPE_EP11 = (__u32) 3, + PKEY_TYPE_CCA_ECC = (__u32) 0x1f, + PKEY_TYPE_EP11_AES = (__u32) 6, + PKEY_TYPE_EP11_ECC = (__u32) 7, }; /* the newer ioctls use a pkey_key_size enum for key size information */ @@ -89,6 +93,20 @@ struct pkey_clrkey { }; /* + * EP11 key blobs of type PKEY_TYPE_EP11_AES and PKEY_TYPE_EP11_ECC + * are ep11 blobs prepended by this header: + */ +struct ep11kblob_header { + __u8 type; /* always 0x00 */ + __u8 hver; /* header version, currently needs to be 0x00 */ + __u16 len; /* total length in bytes (including this header) */ + __u8 version; /* PKEY_TYPE_EP11_AES or PKEY_TYPE_EP11_ECC */ + __u8 res0; /* unused */ + __u16 bitlen; /* clear key bit len, 0 for unknown */ + __u8 res1[8]; /* unused */ +} __packed; + +/* * Generate CCA AES secure key. */ struct pkey_genseck { @@ -304,7 +322,7 @@ struct pkey_verifykey2 { #define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2) /* - * Transform a key blob (of any type) into a protected key, version 2. + * Transform a key blob into a protected key, version 2. * There needs to be a list of apqns given with at least one entry in there. * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain * is not supported. The implementation walks through the list of apqns and @@ -313,6 +331,8 @@ struct pkey_verifykey2 { * list is tried until success (return 0) or the end of the list is reached * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to * generate a list of apqns based on the key. + * Deriving ECC protected keys from ECC secure keys is not supported with + * this ioctl, use PKEY_KBLOB2PROTK3 for this purpose. */ struct pkey_kblob2pkey2 { __u8 __user *key; /* in: pointer to key blob */ @@ -326,17 +346,17 @@ struct pkey_kblob2pkey2 { /* * Build a list of APQNs based on a key blob given. * Is able to find out which type of secure key is given (CCA AES secure - * key, CCA AES cipher key or EP11 AES key) and tries to find all matching - * crypto cards based on the MKVP and maybe other criterias (like CCA AES - * cipher keys need a CEX5C or higher, EP11 keys with BLOB_PKEY_EXTRACTABLE - * need a CEX7 and EP11 api version 4). The list of APQNs is further filtered - * by the key's mkvp which needs to match to either the current mkvp (CCA and - * EP11) or the alternate mkvp (old mkvp, CCA adapters only) of the apqns. The - * flags argument may be used to limit the matching apqns. If the - * PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of each apqn is - * compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both are given, it - * is assumed to return apqns where either the current or the alternate mkvp - * matches. At least one of the matching flags needs to be given. + * key, CCA AES cipher key, CCA ECC private key, EP11 AES key, EP11 ECC private + * key) and tries to find all matching crypto cards based on the MKVP and maybe + * other criterias (like CCA AES cipher keys need a CEX5C or higher, EP11 keys + * with BLOB_PKEY_EXTRACTABLE need a CEX7 and EP11 api version 4). The list of + * APQNs is further filtered by the key's mkvp which needs to match to either + * the current mkvp (CCA and EP11) or the alternate mkvp (old mkvp, CCA adapters + * only) of the apqns. The flags argument may be used to limit the matching + * apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of + * each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both + * are given, it is assumed to return apqns where either the current or the + * alternate mkvp matches. At least one of the matching flags needs to be given. * The flags argument for EP11 keys has no further action and is currently * ignored (but needs to be given as PKEY_FLAGS_MATCH_CUR_MKVP) as there is only * the wkvp from the key to match against the apqn's wkvp. @@ -365,9 +385,10 @@ struct pkey_apqns4key { * restrict the list by given master key verification patterns. * For different key types there may be different ways to match the * master key verification patterns. For CCA keys (CCA data key and CCA - * cipher key) the first 8 bytes of cur_mkvp refer to the current mkvp value - * of the apqn and the first 8 bytes of the alt_mkvp refer to the old mkvp. - * The flags argument controls if the apqns current and/or alternate mkvp + * cipher key) the first 8 bytes of cur_mkvp refer to the current AES mkvp value + * of the apqn and the first 8 bytes of the alt_mkvp refer to the old AES mkvp. + * For CCA ECC keys it is similar but the match is against the APKA current/old + * mkvp. The flags argument controls if the apqns current and/or alternate mkvp * should match. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current * mkvp of each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. * If both are given, it is assumed to return apqns where either the @@ -397,4 +418,30 @@ struct pkey_apqns4keytype { }; #define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype) +/* + * Transform a key blob into a protected key, version 3. + * The difference to version 2 of this ioctl is that the protected key + * buffer is now explicitly and not within a struct pkey_protkey any more. + * So this ioctl is also able to handle EP11 and CCA ECC secure keys and + * provide ECC protected keys. + * There needs to be a list of apqns given with at least one entry in there. + * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain + * is not supported. The implementation walks through the list of apqns and + * tries to send the request to each apqn without any further checking (like + * card type or online state). If the apqn fails, simple the next one in the + * list is tried until success (return 0) or the end of the list is reached + * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to + * generate a list of apqns based on the key. + */ +struct pkey_kblob2pkey3 { + __u8 __user *key; /* in: pointer to key blob */ + __u32 keylen; /* in: key blob size */ + struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ + __u32 apqn_entries; /* in: # of apqn target list entries */ + __u32 pkeytype; /* out: prot key type (enum pkey_key_type) */ + __u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */ + __u8 __user *pkey; /* in: pkey blob buffer space ptr */ +}; +#define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3) + #endif /* _UAPI_PKEY_H */ diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h index 6ca1e68d7103..ede318653c87 100644 --- a/arch/s390/include/uapi/asm/sie.h +++ b/arch/s390/include/uapi/asm/sie.h @@ -29,7 +29,7 @@ { 0x13, "SIGP conditional emergency signal" }, \ { 0x15, "SIGP sense running" }, \ { 0x16, "SIGP set multithreading"}, \ - { 0x17, "SIGP store additional status ait address"} + { 0x17, "SIGP store additional status at address"} #define icpt_prog_codes \ { 0x0001, "Prog Operation" }, \ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index efca70970761..dd73b7f07423 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KPROBES) += kprobes_insn_page.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 5d8cc1864566..ece58f2217cb 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -59,26 +59,6 @@ int main(void) OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]); OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[3]); BLANK(); - /* timeval/timezone offsets for use by vdso */ - OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count); - OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp); - OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec); - OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec); - OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec); - OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec); - OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec); - OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec); - OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec); - OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec); - OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest); - OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available); - OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult); - OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift); - OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir); - OFFSET(__VDSO_TS_END, vdso_data, ts_end); - OFFSET(__VDSO_CLOCK_REALTIME_RES, vdso_data, hrtimer_res); - OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base); - OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time); OFFSET(__VDSO_GETCPU_VAL, vdso_per_cpu_data, getcpu_val); BLANK(); /* constants used by the vdso */ diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index c42ce348103c..205b2e2648aa 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -141,7 +141,7 @@ int copy_oldmem_kernel(void *dst, void *src, size_t count) while (count) { from = __pa(src); if (!OLDMEM_BASE && from < sclp.hsa_size) { - /* Copy from zfcpdump HSA area */ + /* Copy from zfcp/nvme dump HSA area */ len = min(count, sclp.hsa_size - from); rc = memcpy_hsa_kernel(dst, from, len); if (rc) @@ -184,7 +184,7 @@ static int copy_oldmem_user(void __user *dst, void *src, size_t count) while (count) { from = __pa(src); if (!OLDMEM_BASE && from < sclp.hsa_size) { - /* Copy from zfcpdump HSA area */ + /* Copy from zfcp/nvme dump HSA area */ len = min(count, sclp.hsa_size - from); rc = memcpy_hsa_user(dst, from, len); if (rc) @@ -258,7 +258,7 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, } /* - * Remap "oldmem" for zfcpdump + * Remap "oldmem" for zfcp/nvme dump * * We only map available memory above HSA size. Memory below HSA size * is read on demand using the copy_oldmem_page() function. @@ -283,7 +283,7 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, } /* - * Remap "oldmem" for kdump or zfcpdump + * Remap "oldmem" for kdump or zfcp/nvme dump */ int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) @@ -632,11 +632,11 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) u32 alloc_size; u64 hdr_off; - /* If we are not in kdump or zfcpdump mode return */ - if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP) + /* If we are not in kdump or zfcp/nvme dump mode return */ + if (!OLDMEM_BASE && !is_ipl_type_dump()) return 0; - /* If we cannot get HSA size for zfcpdump return error */ - if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size) + /* If we cannot get HSA size for zfcp/nvme dump return error */ + if (is_ipl_type_dump() && !sclp.hsa_size) return -ENODEV; /* For kdump, exclude previous crashkernel memory */ diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index ccba63aaeb47..b8b0cd7b008f 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -104,18 +104,7 @@ static const struct seq_operations show_diag_stat_sops = { .show = show_diag_stat, }; -static int show_diag_stat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &show_diag_stat_sops); -} - -static const struct file_operations show_diag_stat_fops = { - .open = show_diag_stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - +DEFINE_SEQ_ATTRIBUTE(show_diag_stat); static int __init show_diag_stat_init(void) { diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index f304802ecf7b..a7eab7be4db0 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -482,31 +482,37 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr) return (int) (ptr - buffer); } +static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len) +{ + if (user_mode(regs)) { + if (copy_from_user(dst, (char __user *)src, len)) + return -EFAULT; + } else { + if (copy_from_kernel_nofault(dst, src, len)) + return -EFAULT; + } + return 0; +} + void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; unsigned char code[64]; char buffer[128], *ptr; - mm_segment_t old_fs; unsigned long addr; int start, end, opsize, hops, i; /* Get a snapshot of the 64 bytes surrounding the fault address. */ - old_fs = get_fs(); - set_fs(user_mode(regs) ? USER_DS : KERNEL_DS); for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { addr = regs->psw.addr - 34 + start; - if (__copy_from_user(code + start - 2, - (char __user *) addr, 2)) + if (copy_from_regs(regs, code + start - 2, (void *)addr, 2)) break; } for (end = 32; end < 64; end += 2) { addr = regs->psw.addr + end - 32; - if (__copy_from_user(code + end, - (char __user *) addr, 2)) + if (copy_from_regs(regs, code + end, (void *)addr, 2)) break; } - set_fs(old_fs); /* Code snapshot useable ? */ if ((regs->psw.addr & 1) || start >= end) { printk("%s Code: Bad PSW.\n", mode); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 078277231858..705844f73934 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -274,19 +274,6 @@ static int __init disable_vector_extension(char *str) } early_param("novx", disable_vector_extension); -static int __init cad_setup(char *str) -{ - bool enabled; - int rc; - - rc = kstrtobool(str, &enabled); - if (!rc && enabled && test_facility(128)) - /* Enable problem state CAD. */ - __ctl_set_bit(2, 3); - return rc; -} -early_param("cad", cad_setup); - char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c index 6f24d83bc5dc..d9d53f44008a 100644 --- a/arch/s390/kernel/early_printk.c +++ b/arch/s390/kernel/early_printk.c @@ -10,7 +10,7 @@ static void sclp_early_write(struct console *con, const char *s, unsigned int len) { - __sclp_early_printk(s, len, 0); + __sclp_early_printk(s, len); } static struct console sclp_early_console = { diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 23edf196d3dc..86235919c2d1 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -435,10 +435,8 @@ ENTRY(system_call) jz .Lsysc_skip_fpu brasl %r14,load_fpu_regs .Lsysc_skip_fpu: - lg %r14,__LC_VDSO_PER_CPU mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER lmg %r0,%r15,__PT_R0(%r11) b __LC_RETURN_LPSWE @@ -797,13 +795,11 @@ ENTRY(io_int_handler) TRACE_IRQS_ON 0: #endif - lg %r14,__LC_VDSO_PER_CPU mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) tm __PT_PSW+1(%r11),0x01 # returning to user ? jno .Lio_exit_kernel BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER .Lio_exit_kernel: lmg %r0,%r15,__PT_R0(%r11) b __LC_RETURN_LPSWE @@ -1213,14 +1209,12 @@ ENTRY(mcck_int_handler) brasl %r14,s390_handle_mcck TRACE_IRQS_ON .Lmcck_return: - lg %r14,__LC_VDSO_PER_CPU lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? jno 0f BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER 0: lmg %r11,%r15,__PT_R11(%r11) b __LC_RETURN_MCCK_LPSWE diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index a44ddc2f2dec..d2ca3fe51f8e 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -9,7 +9,6 @@ #include <asm/idle.h> extern void *restart_stack; -extern unsigned long suspend_zero_pages; void system_call(void); void pgm_check_handler(void); @@ -17,7 +16,6 @@ void ext_int_handler(void); void io_int_handler(void); void mcck_int_handler(void); void restart_int_handler(void); -void restart_call_handler(void); asmlinkage long do_syscall_trace_enter(struct pt_regs *regs); asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); @@ -62,12 +60,10 @@ void do_notify_resume(struct pt_regs *regs); void __init init_IRQ(void); void do_IRQ(struct pt_regs *regs, int irq); void do_restart(void); -void __init startup_init_nobss(void); void __init startup_init(void); void die(struct pt_regs *regs, const char *str); int setup_profiling_timer(unsigned int multiplier); void __init time_init(void); -void s390_early_resume(void); unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; @@ -92,4 +88,6 @@ void set_fs_fixup(void); unsigned long stack_alloc(void); void stack_free(unsigned long stack); +extern char kprobes_insn_page[]; + #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 90a2a17239b0..98b3aca1de8e 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -40,10 +40,12 @@ #define IPL_FCP_STR "fcp" #define IPL_FCP_DUMP_STR "fcp_dump" #define IPL_NVME_STR "nvme" +#define IPL_NVME_DUMP_STR "nvme_dump" #define IPL_NSS_STR "nss" #define DUMP_CCW_STR "ccw" #define DUMP_FCP_STR "fcp" +#define DUMP_NVME_STR "nvme" #define DUMP_NONE_STR "none" /* @@ -96,6 +98,8 @@ static char *ipl_type_str(enum ipl_type type) return IPL_NSS_STR; case IPL_TYPE_NVME: return IPL_NVME_STR; + case IPL_TYPE_NVME_DUMP: + return IPL_NVME_DUMP_STR; case IPL_TYPE_UNKNOWN: default: return IPL_UNKNOWN_STR; @@ -106,6 +110,7 @@ enum dump_type { DUMP_TYPE_NONE = 1, DUMP_TYPE_CCW = 2, DUMP_TYPE_FCP = 4, + DUMP_TYPE_NVME = 8, }; static char *dump_type_str(enum dump_type type) @@ -117,6 +122,8 @@ static char *dump_type_str(enum dump_type type) return DUMP_CCW_STR; case DUMP_TYPE_FCP: return DUMP_FCP_STR; + case DUMP_TYPE_NVME: + return DUMP_NVME_STR; default: return NULL; } @@ -144,10 +151,12 @@ static struct ipl_parameter_block *reipl_block_actual; static int dump_capabilities = DUMP_TYPE_NONE; static enum dump_type dump_type = DUMP_TYPE_NONE; static struct ipl_parameter_block *dump_block_fcp; +static struct ipl_parameter_block *dump_block_nvme; static struct ipl_parameter_block *dump_block_ccw; static struct sclp_ipl_info sclp_ipl_info; +static bool reipl_nvme_clear; static bool reipl_fcp_clear; static bool reipl_ccw_clear; @@ -266,7 +275,10 @@ static __init enum ipl_type get_ipl_type(void) else return IPL_TYPE_FCP; case IPL_PBT_NVME: - return IPL_TYPE_NVME; + if (ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return IPL_TYPE_NVME_DUMP; + else + return IPL_TYPE_NVME; } return IPL_TYPE_UNKNOWN; } @@ -324,6 +336,7 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, case IPL_TYPE_FCP_DUMP: return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: return sprintf(page, "%08ux\n", ipl_block.nvme.fid); default: return 0; @@ -531,6 +544,7 @@ static int __init ipl_init(void) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); break; case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); break; default: @@ -873,6 +887,24 @@ static struct attribute_group reipl_nvme_attr_group = { .bin_attrs = reipl_nvme_bin_attrs }; +static ssize_t reipl_nvme_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_nvme_clear); +} + +static ssize_t reipl_nvme_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (strtobool(buf, &reipl_nvme_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_nvme_clear_attr = + __ATTR(clear, 0644, reipl_nvme_clear_show, reipl_nvme_clear_store); + /* CCW reipl device attributes */ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); @@ -1099,7 +1131,10 @@ static void __reipl_run(void *unused) break; case IPL_TYPE_NVME: diag308(DIAG308_SET, reipl_block_nvme); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_nvme_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); break; case IPL_TYPE_NSS: diag308(DIAG308_SET, reipl_block_nss); @@ -1109,6 +1144,7 @@ static void __reipl_run(void *unused) diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_FCP_DUMP: + case IPL_TYPE_NVME_DUMP: break; } disabled_wait(); @@ -1219,8 +1255,9 @@ static int __init reipl_fcp_init(void) &sys_reipl_fcp_clear_attr.attr); if (rc) goto out2; - } else + } else { reipl_fcp_clear = true; + } if (ipl_info.type == IPL_TYPE_FCP) { memcpy(reipl_block_fcp, &ipl_block, sizeof(ipl_block)); @@ -1266,10 +1303,16 @@ static int __init reipl_nvme_init(void) } rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); - if (rc) { - kset_unregister(reipl_nvme_kset); - free_page((unsigned long) reipl_block_nvme); - return rc; + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_nvme_kset->kobj, + &sys_reipl_nvme_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_nvme_clear = true; } if (ipl_info.type == IPL_TYPE_NVME) { @@ -1290,6 +1333,13 @@ static int __init reipl_nvme_init(void) } reipl_capabilities |= IPL_TYPE_NVME; return 0; + +out2: + sysfs_remove_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); +out1: + kset_unregister(reipl_nvme_kset); + free_page((unsigned long) reipl_block_nvme); + return rc; } static int __init reipl_type_init(void) @@ -1382,6 +1432,29 @@ static struct attribute_group dump_fcp_attr_group = { .attrs = dump_fcp_attrs, }; +/* NVME dump device attributes */ +DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", + dump_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", + dump_block_nvme->nvme.br_lba); + +static struct attribute *dump_nvme_attrs[] = { + &sys_dump_nvme_fid_attr.attr, + &sys_dump_nvme_nsid_attr.attr, + &sys_dump_nvme_bootprog_attr.attr, + &sys_dump_nvme_br_lba_attr.attr, + NULL, +}; + +static struct attribute_group dump_nvme_attr_group = { + .name = IPL_NVME_STR, + .attrs = dump_nvme_attrs, +}; + /* CCW dump device attributes */ DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw); @@ -1423,6 +1496,8 @@ static ssize_t dump_type_store(struct kobject *kobj, rc = dump_set_type(DUMP_TYPE_CCW); else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0) rc = dump_set_type(DUMP_TYPE_FCP); + else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_NVME); return (rc != 0) ? rc : len; } @@ -1450,6 +1525,9 @@ static void __dump_run(void *unused) case DUMP_TYPE_FCP: diag308_dump(dump_block_fcp); break; + case DUMP_TYPE_NVME: + diag308_dump(dump_block_nvme); + break; default: break; } @@ -1506,6 +1584,29 @@ static int __init dump_fcp_init(void) return 0; } +static int __init dump_nvme_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump) + return 0; /* LDIPL DUMP is not installed */ + dump_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_nvme) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_nvme_attr_group); + if (rc) { + free_page((unsigned long)dump_block_nvme); + return rc; + } + dump_block_nvme->hdr.len = IPL_BP_NVME_LEN; + dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN; + dump_block_nvme->fcp.pbt = IPL_PBT_NVME; + dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_NVME; + return 0; +} + static int __init dump_init(void) { int rc; @@ -1524,6 +1625,9 @@ static int __init dump_init(void) rc = dump_fcp_init(); if (rc) return rc; + rc = dump_nvme_init(); + if (rc) + return rc; dump_set_type(DUMP_TYPE_NONE); return 0; } @@ -1956,6 +2060,7 @@ void __init setup_ipl(void) ipl_info.data.fcp.lun = ipl_block.fcp.lun; break; case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: ipl_info.data.nvme.fid = ipl_block.nvme.fid; ipl_info.data.nvme.nsid = ipl_block.nvme.nsid; break; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index fc30e799bd84..aae24dc75df6 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -7,6 +7,7 @@ * s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com> */ +#include <linux/moduleloader.h> #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/preempt.h> @@ -21,6 +22,7 @@ #include <asm/set_memory.h> #include <asm/sections.h> #include <asm/dis.h> +#include "entry.h" DEFINE_PER_CPU(struct kprobe *, current_kprobe); DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -30,19 +32,32 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; DEFINE_INSN_CACHE_OPS(s390_insn); static int insn_page_in_use; -static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (!page) + return NULL; + __set_memory((unsigned long) page, 1, SET_MEMORY_RO | SET_MEMORY_X); + return page; +} + +void free_insn_page(void *page) +{ + module_memfree(page); +} static void *alloc_s390_insn_page(void) { if (xchg(&insn_page_in_use, 1) == 1) return NULL; - set_memory_x((unsigned long) &insn_page, 1); - return &insn_page; + return &kprobes_insn_page; } static void free_s390_insn_page(void *page) { - set_memory_nx((unsigned long) page, 1); xchg(&insn_page_in_use, 0); } @@ -56,25 +71,29 @@ struct kprobe_insn_cache kprobe_s390_insn_slots = { static void copy_instruction(struct kprobe *p) { + kprobe_opcode_t insn[MAX_INSN_SIZE]; s64 disp, new_disp; u64 addr, new_addr; + unsigned int len; - memcpy(p->ainsn.insn, p->addr, insn_length(*p->addr >> 8)); - p->opcode = p->ainsn.insn[0]; - if (!probe_is_insn_relative_long(p->ainsn.insn)) - return; - /* - * For pc-relative instructions in RIL-b or RIL-c format patch the - * RI2 displacement field. We have already made sure that the insn - * slot for the patched instruction is within the same 2GB area - * as the original instruction (either kernel image or module area). - * Therefore the new displacement will always fit. - */ - disp = *(s32 *)&p->ainsn.insn[1]; - addr = (u64)(unsigned long)p->addr; - new_addr = (u64)(unsigned long)p->ainsn.insn; - new_disp = ((addr + (disp * 2)) - new_addr) / 2; - *(s32 *)&p->ainsn.insn[1] = new_disp; + len = insn_length(*p->addr >> 8); + memcpy(&insn, p->addr, len); + p->opcode = insn[0]; + if (probe_is_insn_relative_long(&insn[0])) { + /* + * For pc-relative instructions in RIL-b or RIL-c format patch + * the RI2 displacement field. We have already made sure that + * the insn slot for the patched instruction is within the same + * 2GB area as the original instruction (either kernel image or + * module area). Therefore the new displacement will always fit. + */ + disp = *(s32 *)&insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&insn[1] = new_disp; + } + s390_kernel_write(p->ainsn.insn, &insn, len); } NOKPROBE_SYMBOL(copy_instruction); diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S new file mode 100644 index 000000000000..f6cb022ef8c8 --- /dev/null +++ b/arch/s390/kernel/kprobes_insn_page.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> + +/* + * insn_page is a special 4k aligned dummy function for kprobes. + * It will contain all kprobed instructions that are out-of-line executed. + * The page must be within the kernel image to guarantee that the + * out-of-line instructions are within 2GB distance of their original + * location. Using a dummy function ensures that the insn_page is within + * the text section of the kernel and mapped read-only/executable from + * the beginning on, thus avoiding to split large mappings if the page + * would be in the data section instead. + */ + .section .kprobes.text, "ax" + .align 4096 +ENTRY(kprobes_insn_page) + .rept 2048 + .word 0x07fe + .endr +ENDPROC(kprobes_insn_page) + .previous diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index d44e522c569b..4d843e64496f 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -37,7 +37,7 @@ #include <linux/root_dev.h> #include <linux/console.h> #include <linux/kernel_stat.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/device.h> #include <linux/notifier.h> #include <linux/pfn.h> @@ -102,7 +102,6 @@ struct mem_detect_info __bootdata(mem_detect); struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table); struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table); -unsigned long __bootdata_preserved(__swsusp_reset_dma); unsigned long __bootdata_preserved(__stext_dma); unsigned long __bootdata_preserved(__etext_dma); unsigned long __bootdata_preserved(__sdma); @@ -119,6 +118,7 @@ EXPORT_SYMBOL(VMALLOC_END); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); +unsigned long vmemmap_size; unsigned long MODULES_VADDR; unsigned long MODULES_END; @@ -128,6 +128,12 @@ struct lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); /* + * The Write Back bit position in the physaddr is given by the SLPC PCI. + * Leaving the mask zero always uses write through which is safe + */ +unsigned long mio_wb_bit_mask __ro_after_init; + +/* * This is set up by the setup-routine at boot-time * for S390 need to find out, what we have to setup * using address 0x10400 ... @@ -245,7 +251,7 @@ static void __init conmode_default(void) #ifdef CONFIG_CRASH_DUMP static void __init setup_zfcpdump(void) { - if (ipl_info.type != IPL_TYPE_FCP_DUMP) + if (!is_ipl_type_dump()) return; if (OLDMEM_BASE) return; @@ -300,7 +306,7 @@ void machine_power_off(void) void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL_GPL(pm_power_off); -void *restart_stack __section(.data); +void *restart_stack; unsigned long stack_alloc(void) { @@ -366,8 +372,12 @@ void __init arch_call_rest_init(void) static void __init setup_lowcore_dat_off(void) { + unsigned long int_psw_mask = PSW_KERNEL_BITS; struct lowcore *lc; + if (IS_ENABLED(CONFIG_KASAN)) + int_psw_mask |= PSW_MASK_DAT; + /* * Setup lowcore for boot cpu */ @@ -379,15 +389,15 @@ static void __init setup_lowcore_dat_off(void) lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; - lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->external_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->program_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->io_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; lc->nodat_stack = ((unsigned long) &init_thread_union) @@ -402,7 +412,6 @@ static void __init setup_lowcore_dat_off(void) memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, sizeof(lc->alt_stfle_fac_list)); nmi_alloc_boot_cpu(lc); - vdso_alloc_boot_cpu(lc); lc->sync_enter_timer = S390_lowcore.sync_enter_timer; lc->async_enter_timer = S390_lowcore.async_enter_timer; lc->exit_timer = S390_lowcore.exit_timer; @@ -552,22 +561,17 @@ static void __init setup_memory_end(void) unsigned long vmax, tmp; /* Choose kernel address space layout: 3 or 4 levels. */ - if (IS_ENABLED(CONFIG_KASAN)) { - vmax = IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) - ? _REGION1_SIZE - : _REGION2_SIZE; - } else { - tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; - tmp = tmp * (sizeof(struct page) + PAGE_SIZE); - if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) - vmax = _REGION2_SIZE; /* 3-level kernel page table */ - else - vmax = _REGION1_SIZE; /* 4-level kernel page table */ - } - + tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; + tmp = tmp * (sizeof(struct page) + PAGE_SIZE); + if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) + vmax = _REGION2_SIZE; /* 3-level kernel page table */ + else + vmax = _REGION1_SIZE; /* 4-level kernel page table */ if (is_prot_virt_host()) adjust_to_uv_max(&vmax); - +#ifdef CONFIG_KASAN + vmax = kasan_vmax; +#endif /* module area is at the end of the kernel address space. */ MODULES_END = vmax; MODULES_VADDR = MODULES_END - MODULES_LEN; @@ -586,9 +590,14 @@ static void __init setup_memory_end(void) /* Take care that memory_end is set and <= vmemmap */ memory_end = min(memory_end ?: max_physmem_end, (unsigned long)vmemmap); #ifdef CONFIG_KASAN - /* fit in kasan shadow memory region between 1:1 and vmemmap */ memory_end = min(memory_end, KASAN_SHADOW_START); - vmemmap = max(vmemmap, (struct page *)KASAN_SHADOW_END); +#endif + vmemmap_size = SECTION_ALIGN_UP(memory_end / PAGE_SIZE) * sizeof(struct page); +#ifdef CONFIG_KASAN + /* move vmemmap above kasan shadow only if stands in a way */ + if (KASAN_SHADOW_END > (unsigned long)vmemmap && + (unsigned long)vmemmap + vmemmap_size > KASAN_SHADOW_START) + vmemmap = max(vmemmap, (struct page *)KASAN_SHADOW_END); #endif max_pfn = max_low_pfn = PFN_DOWN(memory_end); memblock_remove(memory_end, ULONG_MAX); @@ -1133,8 +1142,7 @@ void __init setup_arch(char **cmdline_p) free_mem_detect_info(); remove_oldmem(); - if (is_prot_virt_host()) - setup_uv(); + setup_uv(); setup_memory_end(); setup_memory(); dma_contiguous_reserve(memory_end); @@ -1178,7 +1186,7 @@ void __init setup_arch(char **cmdline_p) if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_init_branches(); - /* Setup zfcpdump support */ + /* Setup zfcp/nvme dump support */ setup_zfcpdump(); /* Add system specific data to the random pool */ diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index b295090e2ce6..9e900a8977bd 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -535,7 +535,6 @@ void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 85700bd85f98..ebfe86d097f0 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -606,14 +606,14 @@ int smp_store_status(int cpu) /* * Collect CPU state of the previous, crashed system. * There are four cases: - * 1) standard zfcp dump - * condition: OLDMEM_BASE == NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * 1) standard zfcp/nvme dump + * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. The boot CPU state is located in * the absolute lowcore of the memory stored in the HSA. The zcore code * will copy the boot CPU state from the HSA. - * 2) stand-alone kdump for SCSI (zfcp dump with swapped memory) - * condition: OLDMEM_BASE != NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * 2) stand-alone kdump for SCSI/NVMe (zfcp/nvme dump with swapped memory) + * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. The firmware or the boot-loader * stored the registers of the boot CPU in the absolute lowcore in the @@ -660,7 +660,7 @@ void __init smp_save_dump_cpus(void) unsigned long page; bool is_boot_cpu; - if (!(OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP)) + if (!(OLDMEM_BASE || is_ipl_type_dump())) /* No previous system present, normal boot. */ return; /* Allocate a page as dumping area for the store status sigps */ @@ -686,7 +686,7 @@ void __init smp_save_dump_cpus(void) /* Get the vector registers */ smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); /* - * For a zfcp dump OLDMEM_BASE == NULL and the registers + * For a zfcp/nvme dump OLDMEM_BASE == NULL and the registers * of the boot CPU are stored in the HSA. To retrieve * these registers an SCLP request is required which is * done by drivers/s390/char/zcore.c:init_cpu_info() diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 1c3b48165e86..28c168000483 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise sys_process_madvise diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 513e59d08a55..0ac30ee2c633 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -41,6 +41,9 @@ #include <linux/gfp.h> #include <linux/kprobes.h> #include <linux/uaccess.h> +#include <vdso/vsyscall.h> +#include <vdso/clocksource.h> +#include <vdso/helpers.h> #include <asm/facility.h> #include <asm/delay.h> #include <asm/div64.h> @@ -84,7 +87,7 @@ void __init time_early_init(void) /* Initialize TOD steering parameters */ tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; - vdso_data->ts_end = tod_steering_end; + vdso_data->arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -257,6 +260,7 @@ static struct clocksource clocksource_tod = { .mult = 1000, .shift = 12, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vdso_clock_mode = VDSO_CLOCKMODE_TOD, }; struct clocksource * __init clocksource_default_clock(void) @@ -264,56 +268,6 @@ struct clocksource * __init clocksource_default_clock(void) return &clocksource_tod; } -void update_vsyscall(struct timekeeper *tk) -{ - u64 nsecps; - - if (tk->tkr_mono.clock != &clocksource_tod) - return; - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_wmb(); - vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last; - vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->wtom_clock_sec = - tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec + - + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); - nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift; - while (vdso_data->wtom_clock_nsec >= nsecps) { - vdso_data->wtom_clock_nsec -= nsecps; - vdso_data->wtom_clock_sec++; - } - - vdso_data->xtime_coarse_sec = tk->xtime_sec; - vdso_data->xtime_coarse_nsec = - (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); - vdso_data->wtom_coarse_sec = - vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_coarse_nsec = - vdso_data->xtime_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - while (vdso_data->wtom_coarse_nsec >= NSEC_PER_SEC) { - vdso_data->wtom_coarse_nsec -= NSEC_PER_SEC; - vdso_data->wtom_coarse_sec++; - } - - vdso_data->tk_mult = tk->tkr_mono.mult; - vdso_data->tk_shift = tk->tkr_mono.shift; - vdso_data->hrtimer_res = hrtimer_resolution; - smp_wmb(); - ++vdso_data->tb_update_count; -} - -extern struct timezone sys_tz; - -void update_vsyscall_tz(void) -{ - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; -} - /* * Initialize the TOD clock and the CPU timer of * the boot cpu. @@ -342,11 +296,12 @@ void __init time_init(void) } static DEFINE_PER_CPU(atomic_t, clock_sync_word); -static DEFINE_MUTEX(clock_sync_mutex); +static DEFINE_MUTEX(stp_mutex); static unsigned long clock_sync_flags; -#define CLOCK_SYNC_HAS_STP 0 -#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_HAS_STP 0 +#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_STPINFO_VALID 2 /* * The get_clock function for the physical clock. It will get the current @@ -431,7 +386,6 @@ static void clock_sync_global(unsigned long long delta) /* Epoch overflow */ tod_clock_base[0]++; /* Adjust TOD steering parameters. */ - vdso_data->tb_update_count++; now = get_tod_clock(); adj = tod_steering_end - now; if (unlikely((s64) adj >= 0)) @@ -443,9 +397,8 @@ static void clock_sync_global(unsigned long long delta) panic("TOD clock sync offset %lli is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1; - vdso_data->ts_end = tod_steering_end; - vdso_data->tb_update_count++; + vdso_data->arch_data.tod_steering_end = tod_steering_end; + /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) lpar_offset = qto.tod_epoch_difference; @@ -492,7 +445,6 @@ static struct stp_sstpi stp_info; static void *stp_page; static void stp_work_fn(struct work_struct *work); -static DEFINE_MUTEX(stp_work_mutex); static DECLARE_WORK(stp_work, stp_work_fn); static struct timer_list stp_timer; @@ -583,10 +535,26 @@ void stp_queue_work(void) queue_work(time_sync_wq, &stp_work); } +static int __store_stpinfo(void) +{ + int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + + if (rc) + clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + else + set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + return rc; +} + +static int stpinfo_valid(void) +{ + return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); +} + static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - unsigned long long clock_delta; + unsigned long long clock_delta, flags; static int first; int rc; @@ -599,16 +567,17 @@ static int stp_sync_clock(void *data) if (stp_info.todoff[0] || stp_info.todoff[1] || stp_info.todoff[2] || stp_info.todoff[3] || stp_info.tmd != 2) { + flags = vdso_update_begin(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); if (rc == 0) { sync->clock_delta = clock_delta; clock_sync_global(clock_delta); - rc = chsc_sstpi(stp_page, &stp_info, - sizeof(struct stp_sstpi)); + rc = __store_stpinfo(); if (rc == 0 && stp_info.tmd != 2) rc = -EAGAIN; } + vdso_update_end(flags); } sync->in_sync = rc ? -EAGAIN : 1; xchg(&first, 0); @@ -628,6 +597,81 @@ static int stp_sync_clock(void *data) return 0; } +static int stp_clear_leap(void) +{ + struct __kernel_timex txc; + int ret; + + memset(&txc, 0, sizeof(txc)); + + ret = do_adjtimex(&txc); + if (ret < 0) + return ret; + + txc.modes = ADJ_STATUS; + txc.status &= ~(STA_INS|STA_DEL); + return do_adjtimex(&txc); +} + +static void stp_check_leap(void) +{ + struct stp_stzi stzi; + struct stp_lsoib *lsoib = &stzi.lsoib; + struct __kernel_timex txc; + int64_t timediff; + int leapdiff, ret; + + if (!stp_info.lu || !check_sync_clock()) { + /* + * Either a scheduled leap second was removed by the operator, + * or STP is out of sync. In both cases, clear the leap second + * kernel flags. + */ + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + return; + } + + if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) { + pr_err("stzi failed\n"); + return; + } + + timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC; + leapdiff = lsoib->nlso - lsoib->also; + + if (leapdiff != 1 && leapdiff != -1) { + pr_err("Cannot schedule %d leap seconds\n", leapdiff); + return; + } + + if (timediff < 0) { + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + } else if (timediff < 7200) { + memset(&txc, 0, sizeof(txc)); + ret = do_adjtimex(&txc); + if (ret < 0) + return; + + txc.modes = ADJ_STATUS; + if (leapdiff > 0) + txc.status |= STA_INS; + else + txc.status |= STA_DEL; + ret = do_adjtimex(&txc); + if (ret < 0) + pr_err("failed to set leap second flags\n"); + /* arm Timer to clear leap second flags */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); + } else { + /* The day the leap second is scheduled for hasn't been reached. Retry + * in one hour. + */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); + } +} + /* * STP work. Check for the STP state and take over the clock * synchronization if the STP clock source is usable. @@ -638,7 +682,7 @@ static void stp_work_fn(struct work_struct *work) int rc; /* prevent multiple execution. */ - mutex_lock(&stp_work_mutex); + mutex_lock(&stp_mutex); if (!stp_online) { chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); @@ -646,23 +690,22 @@ static void stp_work_fn(struct work_struct *work) goto out_unlock; } - rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL); + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xf0e0, NULL); if (rc) goto out_unlock; - rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + rc = __store_stpinfo(); if (rc || stp_info.c == 0) goto out_unlock; /* Skip synchronization if the clock is already in sync. */ - if (check_sync_clock()) - goto out_unlock; - - memset(&stp_sync, 0, sizeof(stp_sync)); - cpus_read_lock(); - atomic_set(&stp_sync.cpus, num_online_cpus() - 1); - stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); - cpus_read_unlock(); + if (!check_sync_clock()) { + memset(&stp_sync, 0, sizeof(stp_sync)); + cpus_read_lock(); + atomic_set(&stp_sync.cpus, num_online_cpus() - 1); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); + } if (!check_sync_clock()) /* @@ -670,9 +713,11 @@ static void stp_work_fn(struct work_struct *work) * Retry after a second. */ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); + else if (stp_info.lu) + stp_check_leap(); out_unlock: - mutex_unlock(&stp_work_mutex); + mutex_unlock(&stp_mutex); } /* @@ -687,10 +732,14 @@ static ssize_t ctn_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%016llx\n", - *(unsigned long long *) stp_info.ctnid); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%016llx\n", + *(unsigned long long *) stp_info.ctnid); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(ctn_id); @@ -699,9 +748,13 @@ static ssize_t ctn_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.ctn); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.ctn); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(ctn_type); @@ -710,9 +763,13 @@ static ssize_t dst_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x2000)) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x2000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(dst_offset); @@ -721,20 +778,56 @@ static ssize_t leap_seconds_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x8000)) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x8000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(leap_seconds); +static ssize_t leap_seconds_scheduled_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct stp_stzi stzi; + ssize_t ret; + + mutex_lock(&stp_mutex); + if (!stpinfo_valid() || !(stp_info.vbits & 0x8000) || !stp_info.lu) { + mutex_unlock(&stp_mutex); + return -ENODATA; + } + + ret = chsc_stzi(stp_page, &stzi, sizeof(stzi)); + mutex_unlock(&stp_mutex); + if (ret < 0) + return ret; + + if (!stzi.lsoib.p) + return sprintf(buf, "0,0\n"); + + return sprintf(buf, "%llu,%d\n", + tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, + stzi.lsoib.nlso - stzi.lsoib.also); +} + +static DEVICE_ATTR_RO(leap_seconds_scheduled); + static ssize_t stratum_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(stratum); @@ -743,9 +836,13 @@ static ssize_t time_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x0800)) - return -ENODATA; - return sprintf(buf, "%i\n", (int) stp_info.tto); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x0800)) + ret = sprintf(buf, "%i\n", (int) stp_info.tto); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(time_offset); @@ -754,9 +851,13 @@ static ssize_t time_zone_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x4000)) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x4000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(time_zone_offset); @@ -765,9 +866,13 @@ static ssize_t timing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.tmd); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tmd); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(timing_mode); @@ -776,9 +881,13 @@ static ssize_t timing_state_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.tst); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tst); + mutex_unlock(&stp_mutex); + return ret; } static DEVICE_ATTR_RO(timing_state); @@ -801,14 +910,14 @@ static ssize_t online_store(struct device *dev, return -EINVAL; if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) return -EOPNOTSUPP; - mutex_lock(&clock_sync_mutex); + mutex_lock(&stp_mutex); stp_online = value; if (stp_online) set_bit(CLOCK_SYNC_STP, &clock_sync_flags); else clear_bit(CLOCK_SYNC_STP, &clock_sync_flags); queue_work(time_sync_wq, &stp_work); - mutex_unlock(&clock_sync_mutex); + mutex_unlock(&stp_mutex); return count; } @@ -824,6 +933,7 @@ static struct device_attribute *stp_attributes[] = { &dev_attr_dst_offset, &dev_attr_leap_seconds, &dev_attr_online, + &dev_attr_leap_seconds_scheduled, &dev_attr_stratum, &dev_attr_time_offset, &dev_attr_time_zone_offset, diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index c296e5c8dbf9..14bd9d58edc9 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -26,33 +26,10 @@ int __bootdata_preserved(prot_virt_guest); struct uv_info __bootdata_preserved(uv_info); #if IS_ENABLED(CONFIG_KVM) -int prot_virt_host; +int __bootdata_preserved(prot_virt_host); EXPORT_SYMBOL(prot_virt_host); EXPORT_SYMBOL(uv_info); -static int __init prot_virt_setup(char *val) -{ - bool enabled; - int rc; - - rc = kstrtobool(val, &enabled); - if (!rc && enabled) - prot_virt_host = 1; - - if (is_prot_virt_guest() && prot_virt_host) { - prot_virt_host = 0; - pr_warn("Protected virtualization not available in protected guests."); - } - - if (prot_virt_host && !test_facility(158)) { - prot_virt_host = 0; - pr_warn("Protected virtualization not supported by the hardware."); - } - - return rc; -} -early_param("prot_virt", prot_virt_setup); - static int __init uv_init(unsigned long stor_base, unsigned long stor_len) { struct uv_cb_init uvcb = { @@ -74,6 +51,24 @@ void __init setup_uv(void) { unsigned long uv_stor_base; + /* + * keep these conditions in line with kasan init code has_uv_sec_stor_limit() + */ + if (!is_prot_virt_host()) + return; + + if (is_prot_virt_guest()) { + prot_virt_host = 0; + pr_warn("Protected virtualization not available in protected guests."); + return; + } + + if (!test_facility(158)) { + prot_virt_host = 0; + pr_warn("Protected virtualization not supported by the hardware."); + return; + } + uv_stor_base = (unsigned long)memblock_alloc_try_nid( uv_info.uv_base_stor_len, SZ_1M, SZ_2G, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); @@ -98,7 +93,8 @@ fail: void adjust_to_uv_max(unsigned long *vmax) { - *vmax = min_t(unsigned long, *vmax, uv_info.max_sec_stor_addr); + if (uv_info.max_sec_stor_addr) + *vmax = min_t(unsigned long, *vmax, uv_info.max_sec_stor_addr); } /* @@ -119,6 +115,26 @@ static int uv_pin_shared(unsigned long paddr) } /* + * Requests the Ultravisor to destroy a guest page and make it + * accessible to the host. The destroy clears the page instead of + * exporting. + * + * @paddr: Absolute host address of page to be destroyed + */ +int uv_destroy_page(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_DESTR_SEC_STOR, + .header.len = sizeof(uvcb), + .paddr = paddr + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* * Requests the Ultravisor to encrypt a guest page and make it * accessible to the host for paging (export). * diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index c4baefaa6e34..f9da5b149141 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -20,6 +20,8 @@ #include <linux/security.h> #include <linux/memblock.h> #include <linux/compat.h> +#include <linux/binfmts.h> +#include <vdso/datapage.h> #include <asm/asm-offsets.h> #include <asm/processor.h> #include <asm/mmu.h> @@ -96,35 +98,12 @@ static union { struct vdso_data data; u8 page[PAGE_SIZE]; } vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; - -/* - * Setup vdso data page. - */ -static void __init vdso_init_data(struct vdso_data *vd) -{ - vd->ectg_available = test_facility(31); -} - +struct vdso_data *vdso_data = (struct vdso_data *)&vdso_data_store.data; /* * Allocate/free per cpu vdso data. */ #define SEGMENT_ORDER 2 -/* - * The initial vdso_data structure for the boot CPU. Eventually - * it is replaced with a properly allocated structure in vdso_init. - * This is necessary because a valid S390_lowcore.vdso_per_cpu_data - * pointer is required to be able to return from an interrupt or - * program check. See the exit paths in entry.S. - */ -struct vdso_data boot_vdso_data __initdata; - -void __init vdso_alloc_boot_cpu(struct lowcore *lowcore) -{ - lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data; -} - int vdso_alloc_per_cpu(struct lowcore *lowcore) { unsigned long segment_table, page_table, page_frame; @@ -246,8 +225,6 @@ static int __init vdso_init(void) { int i; - vdso_init_data(vdso_data); - /* Calculate the size of the 64 bit vDSO */ vdso64_pages = ((&vdso64_end - &vdso64_start + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 4a66a1cb919b..13cc5a3f9abf 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -1,17 +1,23 @@ # SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso, has to be asm only for now +# List of files in the vdso KCOV_INSTRUMENT := n +ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE +ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT -obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o +include $(srctree)/lib/vdso/Makefile +obj-vdso64 = vdso_user_wrapper.o note.o getcpu.o +obj-cvdso64 = vdso64_generic.o +CFLAGS_REMOVE_vdso64_generic.o = -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) # Build rules -targets := $(obj-vdso64) vdso64.so vdso64.so.dbg +targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) +obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64)) KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) KBUILD_AFLAGS_64 += -m64 -s @@ -19,13 +25,13 @@ KBUILD_AFLAGS_64 += -m64 -s KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin ldflags-y := -fPIC -shared -nostdlib -soname=linux-vdso64.so.1 \ - --hash-style=both --build-id -T + --hash-style=both --build-id=sha1 -T $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) obj-y += vdso64_wrapper.o -extra-y += vdso64.lds +targets += vdso64.lds CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) # Disable gcov profiling, ubsan and kasan for VDSO code @@ -37,7 +43,7 @@ KASAN_SANITIZE := n $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so # link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(obj)/vdso64.lds $(obj-vdso64) FORCE +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE $(call if_changed,ld) # strip rule for the .so file @@ -49,9 +55,14 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(obj-vdso64): %.o: %.S FORCE $(call if_changed_dep,vdso64as) +$(obj-cvdso64): %.o: %.c FORCE + $(call if_changed_dep,vdso64cc) + # actual build commands quiet_cmd_vdso64as = VDSO64A $@ cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso64cc = VDSO64C $@ + cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< # install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S deleted file mode 100644 index 0c79caa32b59..000000000000 --- a/arch/s390/kernel/vdso64/clock_getres.S +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of clock_getres() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> - - .text - .align 4 - .globl __kernel_clock_getres - .type __kernel_clock_getres,@function -__kernel_clock_getres: - CFI_STARTPROC - larl %r1,3f - lg %r0,0(%r1) - cghi %r2,__CLOCK_REALTIME_COARSE - je 0f - cghi %r2,__CLOCK_MONOTONIC_COARSE - je 0f - larl %r1,_vdso_data - llgf %r0,__VDSO_CLOCK_REALTIME_RES(%r1) - cghi %r2,__CLOCK_REALTIME - je 0f - cghi %r2,__CLOCK_MONOTONIC - je 0f - cghi %r2,__CLOCK_THREAD_CPUTIME_ID - je 0f - cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ - jne 2f - larl %r5,_vdso_data - icm %r0,15,__LC_ECTG_OK(%r5) - jz 2f -0: ltgr %r3,%r3 - jz 1f /* res == NULL */ - xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */ - stg %r0,8(%r3) /* store tp->tv_usec */ -1: lghi %r2,0 - br %r14 -2: lghi %r1,__NR_clock_getres /* fallback to svc */ - svc 0 - br %r14 - CFI_ENDPROC -3: .quad __CLOCK_COARSE_RES - .size __kernel_clock_getres,.-__kernel_clock_getres diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S deleted file mode 100644 index 9d2ee79b90f2..000000000000 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ /dev/null @@ -1,163 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of clock_gettime() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> -#include <asm/ptrace.h> - - .text - .align 4 - .globl __kernel_clock_gettime - .type __kernel_clock_gettime,@function -__kernel_clock_gettime: - CFI_STARTPROC - aghi %r15,-16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - larl %r5,_vdso_data - cghi %r2,__CLOCK_REALTIME_COARSE - je 4f - cghi %r2,__CLOCK_REALTIME - je 5f - cghi %r2,-3 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ - je 9f - cghi %r2,__CLOCK_MONOTONIC_COARSE - je 3f - cghi %r2,__CLOCK_MONOTONIC - jne 12f - - /* CLOCK_MONOTONIC */ -0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 0b - stcke 0(%r15) /* Store TOD clock */ - lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - lg %r0,__VDSO_WTOM_SEC(%r5) - lg %r1,1(%r15) - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_WTOM_NSEC(%r5) - srlg %r1,%r1,0(%r2) /* >> tk->shift */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 0b - larl %r5,13f -1: clg %r1,0(%r5) - jl 2f - slg %r1,0(%r5) - aghi %r0,1 - j 1b -2: stg %r0,0(%r3) /* store tp->tv_sec */ - stg %r1,8(%r3) /* store tp->tv_nsec */ - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* CLOCK_MONOTONIC_COARSE */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -3: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 3b - lg %r0,__VDSO_WTOM_CRS_SEC(%r5) - lg %r1,__VDSO_WTOM_CRS_NSEC(%r5) - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 3b - j 2b - - /* CLOCK_REALTIME_COARSE */ -4: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 4b - lg %r0,__VDSO_XTIME_CRS_SEC(%r5) - lg %r1,__VDSO_XTIME_CRS_NSEC(%r5) - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 4b - j 7f - - /* CLOCK_REALTIME */ -5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 5b - stcke 0(%r15) /* Store TOD clock */ - lg %r1,1(%r15) - lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ - slgr %r0,%r1 /* now - ts_steering_end */ - ltgr %r0,%r0 /* past end of steering ? */ - jm 17f - srlg %r0,%r0,15 /* 1 per 2^16 */ - tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ - jz 18f - lcgr %r0,%r0 /* negative TOD offset */ -18: algr %r1,%r0 /* add steering offset */ -17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ - srlg %r1,%r1,0(%r2) /* >> tk->shift */ - lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 5b - larl %r5,13f -6: clg %r1,0(%r5) - jl 7f - slg %r1,0(%r5) - aghi %r0,1 - j 6b -7: stg %r0,0(%r3) /* store tp->tv_sec */ - stg %r1,8(%r3) /* store tp->tv_nsec */ - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* CPUCLOCK_VIRT for this thread */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -9: lghi %r4,0 - icm %r0,15,__VDSO_ECTG_OK(%r5) - jz 12f - sacf 256 /* Magic ectg instruction */ - .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 - sacf 0 - algr %r1,%r0 /* r1 = cputime as TOD value */ - mghi %r1,1000 /* convert to nanoseconds */ - srlg %r1,%r1,12 /* r1 = cputime in nanosec */ - lgr %r4,%r1 - larl %r5,13f - srlg %r1,%r1,9 /* divide by 1000000000 */ - mlg %r0,8(%r5) - srlg %r0,%r0,11 /* r0 = tv_sec */ - stg %r0,0(%r3) - msg %r0,0(%r5) /* calculate tv_nsec */ - slgr %r4,%r0 /* r4 = tv_nsec */ - stg %r4,8(%r3) - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* Fallback to system call */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -12: lghi %r1,__NR_clock_gettime - svc 0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - CFI_ENDPROC - -13: .quad 1000000000 -14: .quad 19342813113834067 - .size __kernel_clock_gettime,.-__kernel_clock_gettime diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S deleted file mode 100644 index aebe10dc7c99..000000000000 --- a/arch/s390/kernel/vdso64/gettimeofday.S +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of gettimeofday() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> -#include <asm/ptrace.h> - - .text - .align 4 - .globl __kernel_gettimeofday - .type __kernel_gettimeofday,@function -__kernel_gettimeofday: - CFI_STARTPROC - aghi %r15,-16 - CFI_ADJUST_CFA_OFFSET 16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - larl %r5,_vdso_data -0: ltgr %r3,%r3 /* check if tz is NULL */ - je 1f - mvc 0(8,%r3),__VDSO_TIMEZONE(%r5) -1: ltgr %r2,%r2 /* check if tv is NULL */ - je 4f - lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 0b - stcke 0(%r15) /* Store TOD clock */ - lg %r1,1(%r15) - lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ - slgr %r0,%r1 /* now - ts_steering_end */ - ltgr %r0,%r0 /* past end of steering ? */ - jm 6f - srlg %r0,%r0,15 /* 1 per 2^16 */ - tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ - jz 7f - lcgr %r0,%r0 /* negative TOD offset */ -7: algr %r1,%r0 /* add steering offset */ -6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ - lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 0b - lgf %r5,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - srlg %r1,%r1,0(%r5) /* >> tk->shift */ - larl %r5,5f -2: clg %r1,0(%r5) - jl 3f - slg %r1,0(%r5) - aghi %r0,1 - j 2b -3: stg %r0,0(%r2) /* store tv->tv_sec */ - slgr %r0,%r0 /* tv_nsec -> tv_usec */ - ml %r0,8(%r5) - srlg %r0,%r0,6 - stg %r0,8(%r2) /* store tv->tv_usec */ -4: lghi %r2,0 - aghi %r15,16 - CFI_ADJUST_CFA_OFFSET -16 - CFI_RESTORE 15 - br %r14 - CFI_ENDPROC -5: .quad 1000000000 - .long 274877907 - .size __kernel_gettimeofday,.-__kernel_gettimeofday diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso64/vdso64_generic.c new file mode 100644 index 000000000000..a8cef7e4d137 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64_generic.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../../lib/vdso/gettimeofday.c" + +int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_getres(clock, ts); +} diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S new file mode 100644 index 000000000000..a775d7e52872 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/vdso.h> +#include <asm/unistd.h> +#include <asm/asm-offsets.h> +#include <asm/dwarf.h> +#include <asm/ptrace.h> + +#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8) + +/* + * Older glibc version called vdso without allocating a stackframe. This wrapper + * is just used to allocate a stackframe. See + * https://sourceware.org/git/?p=glibc.git;a=commit;h=478593e6374f3818da39332260dc453cb19cfa1e + * for details. + */ +.macro vdso_func func + .globl __kernel_\func + .type __kernel_\func,@function + .align 8 +__kernel_\func: + CFI_STARTPROC + aghi %r15,-WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE) + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + stg %r14,STACK_FRAME_OVERHEAD(%r15) + brasl %r14,__s390_vdso_\func + lg %r14,STACK_FRAME_OVERHEAD(%r15) + aghi %r15,WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_func gettimeofday +vdso_func clock_getres +vdso_func clock_gettime diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 0e30e6e43b0c..93b3209b94a2 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -333,7 +333,7 @@ EXPORT_SYMBOL(memchr); * memcmp - Compare two areas of memory * @s1: One area of memory * @s2: Another area of memory - * @count: The size of the area. + * @n: The size of the area. */ #ifdef __HAVE_ARCH_MEMCMP int memcmp(const void *s1, const void *s2, size_t n) diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 3175413186b9..cd67e94c16aa 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -8,7 +8,7 @@ obj-y += page-states.o pageattr.o pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o KASAN_SANITIZE_kasan_init.o := n diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index c2ac9b8ae612..8f9ff7e7187d 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/set_memory.h> +#include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> -#include <linux/sched.h> #include <linux/mm.h> #include <linux/kasan.h> +#include <asm/ptdump.h> #include <asm/kasan.h> #include <asm/sections.h> @@ -15,264 +17,235 @@ struct addr_marker { }; enum address_markers_idx { - IDENTITY_NR = 0, + IDENTITY_BEFORE_NR = 0, + IDENTITY_BEFORE_END_NR, KERNEL_START_NR, KERNEL_END_NR, + IDENTITY_AFTER_NR, + IDENTITY_AFTER_END_NR, #ifdef CONFIG_KASAN KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif VMEMMAP_NR, + VMEMMAP_END_NR, VMALLOC_NR, + VMALLOC_END_NR, MODULES_NR, + MODULES_END_NR, }; static struct addr_marker address_markers[] = { - [IDENTITY_NR] = {0, "Identity Mapping"}, + [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, + [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, + [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, + [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, #ifdef CONFIG_KASAN [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, #endif - [VMEMMAP_NR] = {0, "vmemmap Area"}, - [VMALLOC_NR] = {0, "vmalloc Area"}, - [MODULES_NR] = {0, "Modules Area"}, + [VMEMMAP_NR] = {0, "vmemmap Area Start"}, + [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, + [VMALLOC_NR] = {0, "vmalloc Area Start"}, + [VMALLOC_END_NR] = {0, "vmalloc Area End"}, + [MODULES_NR] = {0, "Modules Area Start"}, + [MODULES_END_NR] = {0, "Modules Area End"}, { -1, NULL } }; struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; int level; unsigned int current_prot; + bool check_wx; + unsigned long wx_pages; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt); \ +}) + static void print_prot(struct seq_file *m, unsigned int pr, int level) { static const char * const level_name[] = { "ASCE", "PGD", "PUD", "PMD", "PTE" }; - seq_printf(m, "%s ", level_name[level]); + pt_dump_seq_printf(m, "%s ", level_name[level]); if (pr & _PAGE_INVALID) { - seq_printf(m, "I\n"); + pt_dump_seq_printf(m, "I\n"); return; } - seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); - seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); + pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); } -static void note_page(struct seq_file *m, struct pg_state *st, - unsigned int new_prot, int level) +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ +#ifdef CONFIG_DEBUG_WX + if (!st->check_wx) + return; + if (st->current_prot & _PAGE_INVALID) + return; + if (st->current_prot & _PAGE_PROTECT) + return; + if (st->current_prot & _PAGE_NOEXEC) + return; + /* The first lowcore page is currently still W+X. */ + if (addr == PAGE_SIZE) + return; + WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - static const char units[] = "KMGTPE"; int width = sizeof(unsigned long) * 2; + static const char units[] = "KMGTPE"; const char *unit = units; - unsigned int prot, cur; unsigned long delta; + struct pg_state *st; + struct seq_file *m; + unsigned int prot; - /* - * If we have a "break" in the series, we need to flush the state - * that we have now. "break" is either changing perms, levels or - * address space marker. - */ - prot = new_prot; - cur = st->current_prot; - - if (!st->level) { - /* First entry */ - st->current_prot = new_prot; + st = container_of(pt_st, struct pg_state, ptdump); + m = st->seq; + prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC); + if (level == 4 && (val & _PAGE_INVALID)) + prot = _PAGE_INVALID; + /* For pmd_none() & friends val gets passed as zero. */ + if (level != 4 && !val) + prot = _PAGE_INVALID; + /* Final flush from generic code. */ + if (level == -1) + addr = max_addr; + if (st->level == -1) { + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); + st->start_address = addr; + st->current_prot = prot; st->level = level; - st->marker = address_markers; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || - st->current_address >= st->marker[1].start_address) { - /* Print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - delta = (st->current_address - st->start_address) >> 10; + } else if (prot != st->current_prot || level != st->level || + addr >= st->marker[1].start_address) { + note_prot_wx(st, addr); + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + delta = (addr - st->start_address) >> 10; while (!(delta & 0x3ff) && unit[1]) { delta >>= 10; unit++; } - seq_printf(m, "%9lu%c ", delta, *unit); + pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (st->current_address >= st->marker[1].start_address) { + while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(m, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); } - st->start_address = st->current_address; - st->current_prot = new_prot; + st->start_address = addr; + st->current_prot = prot; st->level = level; } } -#ifdef CONFIG_KASAN -static void note_kasan_early_shadow_page(struct seq_file *m, - struct pg_state *st) -{ - unsigned int prot; - - prot = pte_val(*kasan_early_shadow_pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); -} -#endif - -/* - * The actual page table walker functions. In order to keep the - * implementation of print_prot() short, we only check and pass - * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region, - * segment or page table entry is invalid or read-only. - * After all it's just a hint that the current level being walked - * contains an invalid or read-only entry. - */ -static void walk_pte_level(struct seq_file *m, struct pg_state *st, - pmd_t *pmd, unsigned long addr) -{ - unsigned int prot; - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { - st->current_address = addr; - pte = pte_offset_kernel(pmd, addr); - prot = pte_val(*pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); - addr += PAGE_SIZE; - } -} - -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, - pud_t *pud, unsigned long addr) -{ - unsigned int prot; - pmd_t *pmd; - int i; - -#ifdef CONFIG_KASAN - if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif - - pmd = pmd_offset(pud, addr); - for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) { - st->current_address = addr; - if (!pmd_none(*pmd)) { - if (pmd_large(*pmd)) { - prot = pmd_val(*pmd) & - (_SEGMENT_ENTRY_PROTECT | - _SEGMENT_ENTRY_NOEXEC); - note_page(m, st, prot, 3); - } else - walk_pte_level(m, st, pmd, addr); - } else - note_page(m, st, _PAGE_INVALID, 3); - addr += PMD_SIZE; - } -} - -static void walk_pud_level(struct seq_file *m, struct pg_state *st, - p4d_t *p4d, unsigned long addr) +#ifdef CONFIG_DEBUG_WX +void ptdump_check_wx(void) { - unsigned int prot; - pud_t *pud; - int i; + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = NULL, + .level = -1, + .current_prot = 0, + .check_wx = true, + .wx_pages = 0, + .start_address = 0, + .marker = (struct addr_marker[]) { + { .start_address = 0, .name = NULL}, + { .start_address = -1, .name = NULL}, + }, + }; -#ifdef CONFIG_KASAN - if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) { - note_kasan_early_shadow_page(m, st); + if (!MACHINE_HAS_NX) return; - } -#endif - - pud = pud_offset(p4d, addr); - for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) { - st->current_address = addr; - if (!pud_none(*pud)) - if (pud_large(*pud)) { - prot = pud_val(*pud) & - (_REGION_ENTRY_PROTECT | - _REGION_ENTRY_NOEXEC); - note_page(m, st, prot, 2); - } else - walk_pmd_level(m, st, pud, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += PUD_SIZE; - } + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no unexpected W+X pages found\n"); } +#endif /* CONFIG_DEBUG_WX */ -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, - pgd_t *pgd, unsigned long addr) +#ifdef CONFIG_PTDUMP_DEBUGFS +static int ptdump_show(struct seq_file *m, void *v) { - p4d_t *p4d; - int i; - -#ifdef CONFIG_KASAN - if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = m, + .level = -1, + .current_prot = 0, + .check_wx = false, + .wx_pages = 0, + .start_address = 0, + .marker = address_markers, + }; - p4d = p4d_offset(pgd, addr); - for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) { - st->current_address = addr; - if (!p4d_none(*p4d)) - walk_pud_level(m, st, p4d, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += P4D_SIZE; - } + get_online_mems(); + mutex_lock(&cpa_mutex); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + mutex_unlock(&cpa_mutex); + put_online_mems(); + return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); +#endif /* CONFIG_PTDUMP_DEBUGFS */ -static void walk_pgd_level(struct seq_file *m) +/* + * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple + * insertion sort to preserve the original order of markers with the same + * start address. + */ +static void sort_address_markers(void) { - unsigned long addr = 0; - struct pg_state st; - pgd_t *pgd; - int i; + struct addr_marker tmp; + int i, j; - memset(&st, 0, sizeof(st)); - for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { - st.current_address = addr; - pgd = pgd_offset_k(addr); - if (!pgd_none(*pgd)) - walk_p4d_level(m, &st, pgd, addr); - else - note_page(m, &st, _PAGE_INVALID, 1); - addr += PGDIR_SIZE; - cond_resched(); + for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { + tmp = address_markers[i]; + for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) + address_markers[j + 1] = address_markers[j]; + address_markers[j + 1] = tmp; } - /* Flush out the last page */ - st.current_address = max_addr; - note_page(m, &st, 0, 0); } -static int ptdump_show(struct seq_file *m, void *v) -{ - walk_pgd_level(m); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int pt_dump_init(void) { /* @@ -282,10 +255,17 @@ static int pt_dump_init(void) */ max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); + address_markers[IDENTITY_AFTER_END_NR].start_address = memory_end; address_markers[MODULES_NR].start_address = MODULES_VADDR; + address_markers[MODULES_END_NR].start_address = MODULES_END; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; + address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; + sort_address_markers(); +#ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); +#endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 373542ca1113..cfb0017f33a7 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2679,7 +2679,7 @@ static int __s390_reset_acc(pte_t *ptep, unsigned long addr, pte_t pte = READ_ONCE(*ptep); if (pte_present(pte)) - WARN_ON_ONCE(uv_convert_from_secure(pte_val(pte) & PAGE_MASK)); + WARN_ON_ONCE(uv_destroy_page(pte_val(pte) & PAGE_MASK)); return 0; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 0d282081dc1f..284939f9661c 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -34,6 +34,7 @@ #include <asm/processor.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> +#include <asm/ptdump.h> #include <asm/dma.h> #include <asm/lowcore.h> #include <asm/tlb.h> @@ -45,6 +46,7 @@ #include <asm/kasan.h> #include <asm/dma-mapping.h> #include <asm/uv.h> +#include <linux/virtio_config.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); @@ -129,6 +131,7 @@ void mark_rodata_ro(void) set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); + debug_checkwx(); } int set_memory_encrypted(unsigned long addr, int numpages) @@ -160,6 +163,16 @@ bool force_dma_unencrypted(struct device *dev) return is_prot_virt_guest(); } +#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + +int arch_has_restricted_virtio_memory_access(void) +{ + return is_prot_virt_guest(); +} +EXPORT_SYMBOL(arch_has_restricted_virtio_memory_access); + +#endif + /* protected virtualization */ static void pv_init(void) { diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 99dd1c63a065..5646b39c728a 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -11,7 +11,9 @@ #include <asm/facility.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/uv.h> +unsigned long kasan_vmax; static unsigned long segment_pos __initdata; static unsigned long segment_low __initdata; static unsigned long pgalloc_pos __initdata; @@ -99,8 +101,12 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); if (!has_nx) pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL_EXEC); - sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC); + pgt_prot = pgprot_val(PAGE_KERNEL); + sgt_prot = pgprot_val(SEGMENT_KERNEL); + if (!has_nx || mode == POPULATE_ONE2ONE) { + pgt_prot &= ~_PAGE_NOEXEC; + sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; + } while (address < end) { pg_dir = pgd_offset_k(address); @@ -252,14 +258,31 @@ static void __init kasan_early_detect_facilities(void) } } +static bool __init has_uv_sec_stor_limit(void) +{ + /* + * keep these conditions in line with setup_uv() + */ + if (!is_prot_virt_host()) + return false; + + if (is_prot_virt_guest()) + return false; + + if (!test_facility(158)) + return false; + + return !!uv_info.max_sec_stor_addr; +} + void __init kasan_early_init(void) { unsigned long untracked_mem_end; unsigned long shadow_alloc_size; + unsigned long vmax_unlimited; unsigned long initrd_end; unsigned long asce_type; unsigned long memsize; - unsigned long vmax; unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); pte_t pte_z; pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); @@ -287,7 +310,9 @@ void __init kasan_early_init(void) BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION1_SIZE; + untracked_mem_end = kasan_vmax = vmax_unlimited = _REGION1_SIZE; + if (has_uv_sec_stor_limit()) + kasan_vmax = min(vmax_unlimited, uv_info.max_sec_stor_addr); asce_type = _ASCE_TYPE_REGION2; } else { /* 3 level paging */ @@ -295,7 +320,7 @@ void __init kasan_early_init(void) BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE)); crst_table_init((unsigned long *)early_pg_dir, _REGION3_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION2_SIZE; + untracked_mem_end = kasan_vmax = vmax_unlimited = _REGION2_SIZE; asce_type = _ASCE_TYPE_REGION3; } @@ -365,17 +390,20 @@ void __init kasan_early_init(void) /* populate kasan shadow (for identity mapping and zero page mapping) */ kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP); if (IS_ENABLED(CONFIG_MODULES)) - untracked_mem_end = vmax - MODULES_LEN; + untracked_mem_end = kasan_vmax - MODULES_LEN; if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - untracked_mem_end = vmax - vmalloc_size - MODULES_LEN; + untracked_mem_end = kasan_vmax - vmalloc_size - MODULES_LEN; /* shallowly populate kasan shadow for vmalloc and modules */ kasan_early_vmemmap_populate(__sha(untracked_mem_end), - __sha(vmax), POPULATE_SHALLOW); + __sha(kasan_vmax), POPULATE_SHALLOW); } /* populate kasan shadow for untracked memory */ kasan_early_vmemmap_populate(__sha(max_physmem_end), __sha(untracked_mem_end), POPULATE_ZERO_SHADOW); + kasan_early_vmemmap_populate(__sha(kasan_vmax), + __sha(vmax_unlimited), + POPULATE_ZERO_SHADOW); /* memory allocated for identity mapping structs will be freed later */ pgalloc_freeable = pgalloc_pos; /* populate identity mapping */ diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index c5c52ec2b46f..ed8e5b3575d5 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -278,7 +278,7 @@ static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, return rc; } -static DEFINE_MUTEX(cpa_mutex); +DEFINE_MUTEX(cpa_mutex); static int change_page_attr(unsigned long addr, unsigned long end, unsigned long flags) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0d25f743b270..18205f851c24 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -24,6 +24,26 @@ #include <asm/mmu_context.h> #include <asm/page-states.h> +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index be4b8532dd3c..0a4182792876 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -50,7 +50,6 @@ struct bpf_jit { int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ - int labels[1]; /* Labels for local jumps */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ @@ -229,18 +228,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) REG_SET_SEEN(b3); \ }) -#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \ +#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) -#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \ +#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -1282,7 +1281,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9040000, BPF_REG_0, REG_2); break; } - case BPF_JMP | BPF_TAIL_CALL: + case BPF_JMP | BPF_TAIL_CALL: { + int patch_1_clrj, patch_2_clij, patch_3_brc; + /* * Implicit input: * B1: pointer to ctx @@ -1300,16 +1301,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, offsetof(struct bpf_array, map.max_entries)); /* if ((u32)%b3 >= (u32)%w1) goto out; */ - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clrj %b3,%w1,0xa,label0 */ - EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, - REG_W1, 0, 0xa); - } else { - /* clr %b3,%w1 */ - EMIT2(0x1500, BPF_REG_3, REG_W1); - /* brcl 0xa,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]); - } + /* clrj %b3,%w1,0xa,out */ + patch_1_clrj = jit->prg; + EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa, + jit->prg); /* * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) @@ -1324,16 +1319,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */ - EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1, - MAX_TAIL_CALL_CNT, 0, 0x2); - } else { - /* clfi %w1,MAX_TAIL_CALL_CNT */ - EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT); - /* brcl 0x2,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]); - } + /* clij %w1,MAX_TAIL_CALL_CNT,0x2,out */ + patch_2_clij = jit->prg; + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT, + 2, jit->prg); /* * prog = array->ptrs[index]; @@ -1348,13 +1337,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* ltg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2, REG_1, offsetof(struct bpf_array, ptrs)); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* brc 0x8,label0 */ - EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]); - } else { - /* brcl 0x8,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]); - } + /* brc 0x8,out */ + patch_3_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); /* * Restore registers before calling function @@ -1371,8 +1356,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* bc 0xf,tail_call_start(%r1) */ _EMIT4(0x47f01000 + jit->tail_call_start); /* out: */ - jit->labels[0] = jit->prg; + if (jit->prg_buf) { + *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = + (jit->prg - patch_1_clrj) >> 1; + *(u16 *)(jit->prg_buf + patch_2_clij + 2) = + (jit->prg - patch_2_clij) >> 1; + *(u16 *)(jit->prg_buf + patch_3_brc + 2) = + (jit->prg - patch_3_brc) >> 1; + } break; + } case BPF_JMP | BPF_EXIT: /* return b0 */ last = (i == fp->len - 1) ? 1 : 0; if (last) diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index b4e3c84772a1..bf557a1b789c 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ pci_bus.o +obj-$(CONFIG_PCI_IOV) += pci_iov.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 1804230dd8d8..570016ae8bcd 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -37,6 +37,7 @@ #include <asm/pci_dma.h> #include "pci_bus.h" +#include "pci_iov.h" /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); @@ -226,7 +227,7 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -void __iomem *ioremap(phys_addr_t addr, size_t size) +static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot) { unsigned long offset, vaddr; struct vm_struct *area; @@ -247,14 +248,37 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) return NULL; vaddr = (unsigned long) area->addr; - if (ioremap_page_range(vaddr, vaddr + size, addr, PAGE_KERNEL)) { + if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { free_vm_area(area); return NULL; } return (void __iomem *) ((unsigned long) area->addr + offset); } + +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) +{ + return __ioremap(addr, size, __pgprot(prot)); +} +EXPORT_SYMBOL(ioremap_prot); + +void __iomem *ioremap(phys_addr_t addr, size_t size) +{ + return __ioremap(addr, size, PAGE_KERNEL); +} EXPORT_SYMBOL(ioremap); +void __iomem *ioremap_wc(phys_addr_t addr, size_t size) +{ + return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL)); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem *ioremap_wt(phys_addr_t addr, size_t size) +{ + return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL)); +} +EXPORT_SYMBOL(ioremap_wt); + void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) @@ -390,15 +414,6 @@ static struct pci_ops pci_root_ops = { .write = pci_write, }; -#ifdef CONFIG_PCI_IOV -static struct resource iov_res = { - .name = "PCI IOV res", - .start = 0, - .end = -1, - .flags = IORESOURCE_MEM, -}; -#endif - static void zpci_map_resources(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); @@ -419,16 +434,7 @@ static void zpci_map_resources(struct pci_dev *pdev) pdev->resource[i].end = pdev->resource[i].start + len - 1; } -#ifdef CONFIG_PCI_IOV - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - int bar = i + PCI_IOV_RESOURCES; - - len = pci_resource_len(pdev, bar); - if (!len) - continue; - pdev->resource[bar].parent = &iov_res; - } -#endif + zpci_iov_map_resources(pdev); } static void zpci_unmap_resources(struct pci_dev *pdev) @@ -684,7 +690,7 @@ void zpci_remove_device(struct zpci_dev *zdev) pdev = pci_get_slot(zbus->bus, zdev->devfn); if (pdev) { if (pdev->is_virtfn) - return zpci_remove_virtfn(pdev, zdev->vfn); + return zpci_iov_remove_virtfn(pdev, zdev->vfn); pci_stop_and_remove_bus_device_locked(pdev); } } @@ -788,6 +794,9 @@ static int zpci_mem_init(void) if (!zpci_iomap_bitmap) goto error_iomap_bitmap; + if (static_branch_likely(&have_mio)) + clp_setup_writeback_mio(); + return 0; error_iomap_bitmap: kfree(zpci_iomap_start); @@ -885,9 +894,3 @@ out: return rc; } subsys_initcall_sync(pci_base_init); - -void zpci_rescan(void) -{ - if (zpci_is_enabled()) - clp_rescan_pci_devices_simple(NULL); -} diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 5967f3014156..755b46f4c595 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -24,6 +24,7 @@ #include <asm/pci_dma.h> #include "pci_bus.h" +#include "pci_iov.h" static LIST_HEAD(zbus_list); static DEFINE_SPINLOCK(zbus_list_lock); @@ -126,69 +127,6 @@ static struct zpci_bus *zpci_bus_alloc(int pchid) return zbus; } -#ifdef CONFIG_PCI_IOV -static int zpci_bus_link_virtfn(struct pci_dev *pdev, - struct pci_dev *virtfn, int vfid) -{ - int rc; - - rc = pci_iov_sysfs_link(pdev, virtfn, vfid); - if (rc) - return rc; - - virtfn->is_virtfn = 1; - virtfn->multifunction = 0; - virtfn->physfn = pci_dev_get(pdev); - - return 0; -} - -static int zpci_bus_setup_virtfn(struct zpci_bus *zbus, - struct pci_dev *virtfn, int vfn) -{ - int i, cand_devfn; - struct zpci_dev *zdev; - struct pci_dev *pdev; - int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ - int rc = 0; - - if (!zbus->multifunction) - return 0; - - /* If the parent PF for the given VF is also configured in the - * instance, it must be on the same zbus. - * We can then identify the parent PF by checking what - * devfn the VF would have if it belonged to that PF using the PF's - * stride and offset. Only if this candidate devfn matches the - * actual devfn will we link both functions. - */ - for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { - zdev = zbus->function[i]; - if (zdev && zdev->is_physfn) { - pdev = pci_get_slot(zbus->bus, zdev->devfn); - if (!pdev) - continue; - cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); - if (cand_devfn == virtfn->devfn) { - rc = zpci_bus_link_virtfn(pdev, virtfn, vfid); - /* balance pci_get_slot() */ - pci_dev_put(pdev); - break; - } - /* balance pci_get_slot() */ - pci_dev_put(pdev); - } - } - return rc; -} -#else -static inline int zpci_bus_setup_virtfn(struct zpci_bus *zbus, - struct pci_dev *virtfn, int vfn) -{ - return 0; -} -#endif - void pcibios_bus_add_device(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); @@ -197,9 +135,10 @@ void pcibios_bus_add_device(struct pci_dev *pdev) * With pdev->no_vf_scan the common PCI probing code does not * perform PF/VF linking. */ - if (zdev->vfn) - zpci_bus_setup_virtfn(zdev->zbus, pdev, zdev->vfn); - + if (zdev->vfn) { + zpci_iov_setup_virtfn(zdev->zbus, pdev, zdev->vfn); + pdev->no_command_memory = 1; + } } static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h index 4972433df458..f8dfac0b5b71 100644 --- a/arch/s390/pci/pci_bus.h +++ b/arch/s390/pci/pci_bus.h @@ -9,7 +9,6 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); void zpci_bus_device_unregister(struct zpci_dev *zdev); -int zpci_bus_init(void); void zpci_release_device(struct kref *kref); static inline void zpci_zdev_put(struct zpci_dev *zdev) @@ -30,15 +29,3 @@ static inline struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus, return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? NULL : zbus->function[devfn]; } -#ifdef CONFIG_PCI_IOV -static inline void zpci_remove_virtfn(struct pci_dev *pdev, int vfn) -{ - - pci_lock_rescan_remove(); - /* Linux' vfid's start at 0 vfn at 1 */ - pci_iov_remove_virtfn(pdev->physfn, vfn - 1); - pci_unlock_rescan_remove(); -} -#else /* CONFIG_PCI_IOV */ -static inline void zpci_remove_virtfn(struct pci_dev *pdev, int vfn) {} -#endif /* CONFIG_PCI_IOV */ diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 7e735f41a0a6..153720d21ae7 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -102,6 +102,7 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->msi_addr = response->msia; zdev->max_msi = response->noi; zdev->fmb_update = response->mui; + zdev->version = response->version; switch (response->version) { case 1: @@ -167,6 +168,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, if (response->util_str_avail) { memcpy(zdev->util_str, response->util_str, sizeof(zdev->util_str)); + zdev->util_str_avail = 1; } zdev->mio_capable = response->mio_addr_avail; for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -244,6 +246,7 @@ error: return rc; } +static int clp_refresh_fh(u32 fid); /* * Enable/Disable a given PCI function and update its function handle if * necessary @@ -286,7 +289,41 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u8 nr_dma_as, u8 command) } else if (!rc && rrb->response.hdr.rsp == CLP_RC_SETPCIFN_ALRDY && rrb->response.fh == 0) { /* Function is already in desired state - update handle */ - rc = clp_rescan_pci_devices_simple(&fid); + rc = clp_refresh_fh(fid); + } + clp_free_block(rrb); + return rc; +} + +int clp_setup_writeback_mio(void) +{ + struct clp_req_rsp_slpc_pci *rrb; + u8 wb_bit_pos; + int rc; + + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_SLPC; + rrb->response.hdr.len = sizeof(rrb->response); + + rc = clp_req(rrb, CLP_LPS_PCI); + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { + if (rrb->response.vwb) { + wb_bit_pos = rrb->response.mio_wb; + set_bit_inv(wb_bit_pos, &mio_wb_bit_mask); + zpci_dbg(3, "wb bit: %d\n", wb_bit_pos); + } else { + zpci_dbg(3, "wb bit: n.a.\n"); + } + + } else { + zpci_err("SLPC PCI:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + rc = -EIO; } clp_free_block(rrb); return rc; @@ -374,24 +411,6 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data) clp_add_pci_device(entry->fid, entry->fh, entry->config_state); } -static void __clp_update(struct clp_fh_list_entry *entry, void *data) -{ - struct zpci_dev *zdev; - u32 *fid = data; - - if (!entry->vendor_id) - return; - - if (fid && *fid != entry->fid) - return; - - zdev = get_zdev_by_fid(entry->fid); - if (!zdev) - return; - - zdev->fh = entry->fh; -} - int clp_scan_pci_devices(void) { struct clp_req_rsp_list_pci *rrb; @@ -407,27 +426,25 @@ int clp_scan_pci_devices(void) return rc; } -int clp_rescan_pci_devices(void) +static void __clp_refresh_fh(struct clp_fh_list_entry *entry, void *data) { - struct clp_req_rsp_list_pci *rrb; - int rc; - - zpci_remove_reserved_devices(); + struct zpci_dev *zdev; + u32 fid = *((u32 *)data); - rrb = clp_alloc_block(GFP_KERNEL); - if (!rrb) - return -ENOMEM; + if (!entry->vendor_id || fid != entry->fid) + return; - rc = clp_list_pci(rrb, NULL, __clp_add); + zdev = get_zdev_by_fid(fid); + if (!zdev) + return; - clp_free_block(rrb); - return rc; + zdev->fh = entry->fh; } -/* Rescan PCI functions and refresh function handles. If fid is non-NULL only - * refresh the handle of the function matching @fid +/* + * Refresh the function handle of the function matching @fid */ -int clp_rescan_pci_devices_simple(u32 *fid) +static int clp_refresh_fh(u32 fid) { struct clp_req_rsp_list_pci *rrb; int rc; @@ -436,7 +453,7 @@ int clp_rescan_pci_devices_simple(u32 *fid) if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, fid, __clp_update); + rc = clp_list_pci(rrb, &fid, __clp_refresh_fh); clp_free_block(rrb); return rc; @@ -495,7 +512,7 @@ static int clp_base_command(struct clp_req *req, struct clp_req_hdr *lpcb) } } -static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb) +static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc_pci *lpcb) { unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 64b1399a73f0..ebc9a49523aa 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/iommu-helper.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/vmalloc.h> #include <linux/pci.h> #include <asm/pci_dma.h> @@ -261,13 +261,11 @@ static unsigned long __dma_alloc_iommu(struct device *dev, unsigned long start, int size) { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long boundary_size; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages, start, size, zdev->start_dma >> PAGE_SHIFT, - boundary_size, 0); + dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT), + 0); } static dma_addr_t dma_alloc_address(struct device *dev, int size) @@ -670,6 +668,8 @@ const struct dma_map_ops s390_pci_dma_ops = { .unmap_page = s390_dma_unmap_pages, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, /* dma_supported is unconditionally true without a callback */ }; EXPORT_SYMBOL_GPL(s390_pci_dma_ops); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d9ae7456dd4c..d33f21545dfd 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -152,7 +152,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) } break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ - clp_rescan_pci_devices(); + zpci_remove_reserved_devices(); + clp_scan_pci_devices(); break; case 0x0308: /* Standby -> Reserved */ if (!zdev) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c new file mode 100644 index 000000000000..ead062bf2b41 --- /dev/null +++ b/arch/s390/pci/pci_iov.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "pci_iov.h" + +static struct resource iov_res = { + .name = "PCI IOV res", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, +}; + +void zpci_iov_map_resources(struct pci_dev *pdev) +{ + resource_size_t len; + int i; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + int bar = i + PCI_IOV_RESOURCES; + + len = pci_resource_len(pdev, bar); + if (!len) + continue; + pdev->resource[bar].parent = &iov_res; + } +} + +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) +{ + pci_lock_rescan_remove(); + /* Linux' vfid's start at 0 vfn at 1 */ + pci_iov_remove_virtfn(pdev->physfn, vfn - 1); + pci_unlock_rescan_remove(); +} + +static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, int vfid) +{ + int rc; + + rc = pci_iov_sysfs_link(pdev, virtfn, vfid); + if (rc) + return rc; + + virtfn->is_virtfn = 1; + virtfn->multifunction = 0; + virtfn->physfn = pci_dev_get(pdev); + + return 0; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + int i, cand_devfn; + struct zpci_dev *zdev; + struct pci_dev *pdev; + int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ + int rc = 0; + + if (!zbus->multifunction) + return 0; + + /* If the parent PF for the given VF is also configured in the + * instance, it must be on the same zbus. + * We can then identify the parent PF by checking what + * devfn the VF would have if it belonged to that PF using the PF's + * stride and offset. Only if this candidate devfn matches the + * actual devfn will we link both functions. + */ + for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { + zdev = zbus->function[i]; + if (zdev && zdev->is_physfn) { + pdev = pci_get_slot(zbus->bus, zdev->devfn); + if (!pdev) + continue; + cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); + if (cand_devfn == virtfn->devfn) { + rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); + /* balance pci_get_slot() */ + pci_dev_put(pdev); + break; + } + /* balance pci_get_slot() */ + pci_dev_put(pdev); + } + } + return rc; +} diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h new file mode 100644 index 000000000000..b2c828003bad --- /dev/null +++ b/arch/s390/pci/pci_iov.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ + +#ifndef __S390_PCI_IOV_H +#define __S390_PCI_IOV_H + +#ifdef CONFIG_PCI_IOV +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn); + +void zpci_iov_map_resources(struct pci_dev *pdev); + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); + +#else /* CONFIG_PCI_IOV */ +static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} + +static inline void zpci_iov_map_resources(struct pci_dev *pdev) {} + +static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + return 0; +} +#endif /* CONFIG_PCI_IOV */ +#endif /* __S390_PCI_IOV_h */ diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss deleted file mode 100644 index f4f4c2c6dee9..000000000000 --- a/arch/s390/scripts/Makefile.chkbss +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -chkbss-target ?= built-in.a -$(obj)/$(chkbss-target): chkbss - -chkbss-files := $(addsuffix .chkbss, $(chkbss)) -clean-files += $(chkbss-files) - -PHONY += chkbss -chkbss: $(addprefix $(obj)/, $(chkbss-files)) - -quiet_cmd_chkbss = CHKBSS $< - cmd_chkbss = \ - if ! $(OBJSIZE) --common $< | $(AWK) 'END { if ($$3) exit 1 }'; then \ - echo "error: $< .bss section is not empty" >&2; exit 1; \ - fi; \ - touch $@; - -$(obj)/%.o.chkbss: $(obj)/%.o - $(call cmd,chkbss) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 18278152c91c..159da4ed578f 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -71,6 +71,7 @@ config SUPERH select PERF_EVENTS select PERF_USE_VMALLOC select RTC_LIB + select SET_FS select SPARSE_IRQ help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c index 665cad452798..bac8a058ebd7 100644 --- a/arch/sh/boards/mach-ap325rxa/setup.c +++ b/arch/sh/boards/mach-ap325rxa/setup.c @@ -13,6 +13,7 @@ #include <cpu/sh7723.h> +#include <linux/dma-map-ops.h> #include <linux/clkdev.h> #include <linux/delay.h> #include <linux/device.h> diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index dd427bac5cde..bab91a99124e 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -36,6 +36,7 @@ #include <linux/usb/r8a66597.h> #include <linux/usb/renesas_usbhs.h> #include <linux/videodev2.h> +#include <linux/dma-map-ops.h> #include <media/drv-intf/renesas-ceu.h> #include <media/i2c/mt9t112.h> diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index 96538ba3aa32..eeb5ce341efd 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -14,7 +14,6 @@ #include <linux/clkdev.h> #include <linux/delay.h> -#include <linux/dma-mapping.h> #include <linux/gpio.h> #include <linux/gpio/machine.h> #include <linux/i2c.h> @@ -33,6 +32,7 @@ #include <linux/sh_intc.h> #include <linux/usb/r8a66597.h> #include <linux/videodev2.h> +#include <linux/dma-map-ops.h> #include <mach/kfr2r09.h> diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index 9ed369dad62d..6703a2122c0d 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -5,7 +5,7 @@ * Copyright (C) 2008 Magnus Damm */ #include <linux/clkdev.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/platform_device.h> #include <linux/interrupt.h> diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index 32f5dd944889..8d6541ba0186 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -32,6 +32,7 @@ #include <linux/smc91x.h> #include <linux/usb/r8a66597.h> #include <linux/videodev2.h> +#include <linux/dma-map-ops.h> #include <mach-se/mach/se7724.h> #include <media/drv-intf/renesas-ceu.h> diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c index 7be8694c0d13..41e4daee8f04 100644 --- a/arch/sh/drivers/pci/fixups-dreamcast.c +++ b/arch/sh/drivers/pci/fixups-dreamcast.c @@ -19,7 +19,7 @@ #include <linux/init.h> #include <linux/irq.h> #include <linux/pci.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <asm/io.h> #include <asm/irq.h> diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index 6ab0b7377f66..a3903304f33f 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -13,7 +13,6 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/types.h> -#include <linux/dma-debug.h> #include <linux/io.h> #include <linux/mutex.h> #include <linux/spinlock.h> diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index e0b568aaa701..4468289ab2ca 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -12,6 +12,7 @@ #include <linux/io.h> #include <linux/async.h> #include <linux/delay.h> +#include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/clk.h> #include <linux/sh_clk.h> @@ -31,6 +32,8 @@ struct sh7786_pcie_port { static struct sh7786_pcie_port *sh7786_pcie_ports; static unsigned int nr_ports; static unsigned long dma_pfn_offset; +size_t memsize; +u64 memstart; static struct sh7786_pcie_hwops { int (*core_init)(void); @@ -301,7 +304,6 @@ static int __init pcie_init(struct sh7786_pcie_port *port) struct pci_channel *chan = port->hose; unsigned int data; phys_addr_t memstart, memend; - size_t memsize; int ret, i, win; /* Begin initialization */ @@ -368,8 +370,6 @@ static int __init pcie_init(struct sh7786_pcie_port *port) memstart = ALIGN_DOWN(memstart, memsize); memsize = roundup_pow_of_two(memend - memstart); - dma_pfn_offset = memstart >> PAGE_SHIFT; - /* * If there's more than 512MB of memory, we need to roll over to * LAR1/LAMR1. @@ -487,7 +487,8 @@ int pcibios_map_platform_irq(const struct pci_dev *pdev, u8 slot, u8 pin) void pcibios_bus_add_device(struct pci_dev *pdev) { - pdev->dev.dma_pfn_offset = dma_pfn_offset; + dma_direct_set_offset(&pdev->dev, __pa(memory_start), + __pa(memory_start) - memstart, memsize); } static int __init sh7786_pcie_core_init(void) diff --git a/arch/sh/kernel/dma-coherent.c b/arch/sh/kernel/dma-coherent.c index cd46a9825e3c..6a44c0e7ba40 100644 --- a/arch/sh/kernel/dma-coherent.c +++ b/arch/sh/kernel/dma-coherent.c @@ -3,7 +3,7 @@ * Copyright (C) 2004 - 2007 Paul Mundt */ #include <linux/mm.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <asm/cacheflush.h> #include <asm/addrspace.h> diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 4fe3f00137bc..1add47fd31f6 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -502,8 +502,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0, if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, save_r0); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index ae0a00beea5f..783738448ff5 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 096530eac8e1..a6ca135442f9 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -51,6 +51,7 @@ config SPARC select LOCKDEP_SMALL if LOCKDEP select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH + select SET_FS config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h index 9a52d9506f80..549f0a72280d 100644 --- a/arch/sparc/include/asm/io_32.h +++ b/arch/sparc/include/asm/io_32.h @@ -11,6 +11,13 @@ #define memcpy_fromio(d,s,sz) _memcpy_fromio(d,s,sz) #define memcpy_toio(d,s,sz) _memcpy_toio(d,s,sz) +/* + * Bus number may be embedded in the higher bits of the physical address. + * This is why we have no bus number argument to ioremap(). + */ +void __iomem *ioremap(phys_addr_t offset, size_t size); +void iounmap(volatile void __iomem *addr); + #include <asm-generic/io.h> static inline void _memset_io(volatile void __iomem *dst, @@ -121,14 +128,6 @@ static inline void sbus_memcpy_toio(volatile void __iomem *dst, } } -#ifdef __KERNEL__ - -/* - * Bus number may be embedded in the higher bits of the physical address. - * This is why we have no bus number argument to ioremap(). - */ -void __iomem *ioremap(phys_addr_t offset, size_t size); -void iounmap(volatile void __iomem *addr); /* Create a virtual mapping cookie for an IO port range */ void __iomem *ioport_map(unsigned long port, unsigned int nr); void ioport_unmap(void __iomem *); @@ -148,8 +147,6 @@ static inline int sbus_can_burst64(void) struct device; void sbus_set_sbus64(struct device *, int); -#endif - #define __ARCH_HAS_NO_PAGE_ZERO_MAPPED 1 diff --git a/arch/sparc/kernel/iommu-common.c b/arch/sparc/kernel/iommu-common.c index 59cb16691322..23ca75f09277 100644 --- a/arch/sparc/kernel/iommu-common.c +++ b/arch/sparc/kernel/iommu-common.c @@ -166,13 +166,6 @@ unsigned long iommu_tbl_range_alloc(struct device *dev, } } - if (dev) - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - 1 << iommu->table_shift); - else - boundary_size = ALIGN(1ULL << 32, 1 << iommu->table_shift); - - boundary_size = boundary_size >> iommu->table_shift; /* * if the skip_span_boundary_check had been set during init, we set * things up so that iommu_is_span_boundary() merely checks if the @@ -181,6 +174,9 @@ unsigned long iommu_tbl_range_alloc(struct device *dev, if ((iommu->flags & IOMMU_NO_SPAN_BOUND) != 0) { shift = 0; boundary_size = iommu->poolsize * iommu->nr_pools; + } else { + boundary_size = dma_get_seg_boundary_nr_pages(dev, + iommu->table_shift); } n = iommu_area_alloc(iommu->map, limit, start, npages, shift, boundary_size, align_mask); diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c index 4ae7388b1bff..a034f571d869 100644 --- a/arch/sparc/kernel/iommu.c +++ b/arch/sparc/kernel/iommu.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include <linux/delay.h> #include <linux/device.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/errno.h> #include <linux/iommu-helper.h> #include <linux/bitmap.h> @@ -472,8 +472,7 @@ static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, outs->dma_length = 0; max_seg_size = dma_get_max_seg_size(dev); - seg_boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - IO_PAGE_SIZE) >> IO_PAGE_SHIFT; + seg_boundary_size = dma_get_seg_boundary_nr_pages(dev, IO_PAGE_SHIFT); base_shift = iommu->tbl.table_map_base >> IO_PAGE_SHIFT; for_each_sg(sglist, s, nelems, i) { unsigned long paddr, npages, entry, out_entry = 0, slen; diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index d6874c9b639f..8e1d72a16759 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -38,7 +38,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/scatterlist.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/of_device.h> #include <asm/io.h> diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 14b93c5564e3..9de57e88f7a1 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -16,6 +16,7 @@ #include <linux/export.h> #include <linux/log2.h> #include <linux/of_device.h> +#include <linux/dma-map-ops.h> #include <asm/iommu-common.h> #include <asm/iommu.h> @@ -508,8 +509,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, iommu_batch_start(dev, prot, ~0UL); max_seg_size = dma_get_max_seg_size(dev); - seg_boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - IO_PAGE_SIZE) >> IO_PAGE_SHIFT; + seg_boundary_size = dma_get_seg_boundary_nr_pages(dev, IO_PAGE_SHIFT); mask = *dev->dma_mask; if (!iommu_use_atu(iommu, mask)) diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index 5234b5ccc0b9..0442ab00518d 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -25,7 +25,7 @@ asmlinkage long sparc_fork(struct pt_regs *regs) .stack = regs->u_regs[UREG_FP], }; - ret = _do_fork(&args); + ret = kernel_clone(&args); /* If we get an error and potentially restart the system * call, we're screwed because copy_thread() clobbered @@ -50,7 +50,7 @@ asmlinkage long sparc_vfork(struct pt_regs *regs) .stack = regs->u_regs[UREG_FP], }; - ret = _do_fork(&args); + ret = kernel_clone(&args); /* If we get an error and potentially restart the system * call, we're screwed because copy_thread() clobbered @@ -96,7 +96,7 @@ asmlinkage long sparc_clone(struct pt_regs *regs) else args.stack = regs->u_regs[UREG_FP]; - ret = _do_fork(&args); + ret = kernel_clone(&args); /* If we get an error and potentially restart the system * call, we're screwed because copy_thread() clobbered diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index d0e0025ee3ba..741d0701003a 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -523,10 +523,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, { if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, orig_i0); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } asmlinkage int do_sys_sigstack(struct sigstack __user *ssptr, diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 255264bcb46a..f7ef7edcd5c1 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -551,10 +551,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long uprobe_notify_resume(regs); if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, orig_i0); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } user_enter(); } diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e286e2badc8a..e38d8bf454e8 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1039,38 +1039,9 @@ void smp_fetch_global_pmu(void) * are flush_tlb_*() routines, and these run after flush_cache_*() * which performs the flushw. * - * The SMP TLB coherency scheme we use works as follows: - * - * 1) mm->cpu_vm_mask is a bit mask of which cpus an address - * space has (potentially) executed on, this is the heuristic - * we use to avoid doing cross calls. - * - * Also, for flushing from kswapd and also for clones, we - * use cpu_vm_mask as the list of cpus to make run the TLB. - * - * 2) TLB context numbers are shared globally across all processors - * in the system, this allows us to play several games to avoid - * cross calls. - * - * One invariant is that when a cpu switches to a process, and - * that processes tsk->active_mm->cpu_vm_mask does not have the - * current cpu's bit set, that tlb context is flushed locally. - * - * If the address space is non-shared (ie. mm->count == 1) we avoid - * cross calls when we want to flush the currently running process's - * tlb state. This is done by clearing all cpu bits except the current - * processor's in current->mm->cpu_vm_mask and performing the - * flush locally only. This will force any subsequent cpus which run - * this task to flush the context from the local tlb if the process - * migrates to another cpu (again). - * - * 3) For shared address spaces (threads) and swapping we bite the - * bullet for most cases and perform the cross call (but only to - * the cpus listed in cpu_vm_mask). - * - * The performance gain from "optimizing" away the cross call for threads is - * questionable (in theory the big win for threads is the massive sharing of - * address space state across processors). + * mm->cpu_vm_mask is a bit mask of which cpus an address + * space has (potentially) executed on, this is the heuristic + * we use to limit cross calls. */ /* This currently is only used by the hugetlb arch pre-fault @@ -1080,18 +1051,13 @@ void smp_fetch_global_pmu(void) void smp_flush_tlb_mm(struct mm_struct *mm) { u32 ctx = CTX_HWBITS(mm->context); - int cpu = get_cpu(); - if (atomic_read(&mm->mm_users) == 1) { - cpumask_copy(mm_cpumask(mm), cpumask_of(cpu)); - goto local_flush_and_out; - } + get_cpu(); smp_cross_call_masked(&xcall_flush_tlb_mm, ctx, 0, 0, mm_cpumask(mm)); -local_flush_and_out: __flush_tlb_mm(ctx, SECONDARY_CONTEXT); put_cpu(); @@ -1114,17 +1080,15 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long { u32 ctx = CTX_HWBITS(mm->context); struct tlb_pending_info info; - int cpu = get_cpu(); + + get_cpu(); info.ctx = ctx; info.nr = nr; info.vaddrs = vaddrs; - if (mm == current->mm && atomic_read(&mm->mm_users) == 1) - cpumask_copy(mm_cpumask(mm), cpumask_of(cpu)); - else - smp_call_function_many(mm_cpumask(mm), tlb_pending_func, - &info, 1); + smp_call_function_many(mm_cpumask(mm), tlb_pending_func, + &info, 1); __flush_tlb_pending(ctx, nr, vaddrs); @@ -1134,14 +1098,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr) { unsigned long context = CTX_HWBITS(mm->context); - int cpu = get_cpu(); - if (mm == current->mm && atomic_read(&mm->mm_users) == 1) - cpumask_copy(mm_cpumask(mm), cpumask_of(cpu)); - else - smp_cross_call_masked(&xcall_flush_tlb_page, - context, vaddr, 0, - mm_cpumask(mm)); + get_cpu(); + + smp_cross_call_masked(&xcall_flush_tlb_page, + context, vaddr, 0, + mm_cpumask(mm)); + __flush_tlb_page(context, vaddr); put_cpu(); diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 37ec52b34c73..78160260991b 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -485,3 +485,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c index 430a47a1b6ae..bf3e6d2fe5d9 100644 --- a/arch/sparc/mm/io-unit.c +++ b/arch/sparc/mm/io-unit.c @@ -11,7 +11,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/bitops.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/of.h> #include <linux/of_device.h> diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 3a388b1c5d4b..0c0342e5b10d 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/of.h> #include <linux/of_device.h> diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index f44355e46f31..c5e1545bc5cf 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -3,8 +3,6 @@ # Building vDSO images for sparc. # -KBUILD_CFLAGS += $(DISABLE_LTO) - VDSO64-$(CONFIG_SPARC64) := y VDSOCOMPAT-$(CONFIG_COMPAT) := y @@ -115,7 +113,7 @@ quiet_cmd_vdso = VDSO $@ -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(OBJDUMP)' '$@' -VDSO_LDFLAGS = -shared --hash-style=both --build-id -Bsymbolic +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 -Bsymbolic GCOV_PROFILE := n # diff --git a/arch/um/Kconfig b/arch/um/Kconfig index d49f471b02e3..4b799fad8b48 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -19,6 +19,7 @@ config UML select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS select HAVE_GCC_PLUGINS + select SET_FS select TTY # Needed for line.c config MMU @@ -62,12 +63,12 @@ config NR_CPUS source "arch/$(HEADER_ARCH)/um/Kconfig" -config FORBID_STATIC_LINK - bool +config MAY_HAVE_RUNTIME_DEPS + bool config STATIC_LINK bool "Force a static link" - depends on !FORBID_STATIC_LINK + depends on CC_CAN_LINK_STATIC_NO_RUNTIME_DEPS || !MAY_HAVE_RUNTIME_DEPS help This option gives you the ability to force a static link of UML. Normally, UML is linked as a shared binary. This is inconvenient for diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 9160ead56e33..2e7b8e0e7194 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -234,7 +234,7 @@ config UML_NET_DAEMON config UML_NET_VECTOR bool "Vector I/O high performance network devices" depends on UML_NET - select FORBID_STATIC_LINK + select MAY_HAVE_RUNTIME_DEPS help This User-Mode Linux network driver uses multi-message send and receive functions. The host running the UML guest must have @@ -246,7 +246,7 @@ config UML_NET_VECTOR config UML_NET_VDE bool "VDE transport (obsolete)" depends on UML_NET - select FORBID_STATIC_LINK + select MAY_HAVE_RUNTIME_DEPS help This User-Mode Linux network transport allows one or more running UMLs on a single host to communicate with each other and also @@ -294,7 +294,7 @@ config UML_NET_MCAST config UML_NET_PCAP bool "pcap transport (obsolete)" depends on UML_NET - select FORBID_STATIC_LINK + select MAY_HAVE_RUNTIME_DEPS help The pcap transport makes a pcap packet stream on the host look like an ethernet device inside UML. This is useful for making diff --git a/arch/um/drivers/daemon_user.c b/arch/um/drivers/daemon_user.c index 3695821d06a2..785baedc3555 100644 --- a/arch/um/drivers/daemon_user.c +++ b/arch/um/drivers/daemon_user.c @@ -7,6 +7,7 @@ */ #include <stdint.h> +#include <string.h> #include <unistd.h> #include <errno.h> #include <sys/types.h> diff --git a/arch/um/drivers/pcap_user.c b/arch/um/drivers/pcap_user.c index bbd20638788a..52ddda3e3b10 100644 --- a/arch/um/drivers/pcap_user.c +++ b/arch/um/drivers/pcap_user.c @@ -32,7 +32,7 @@ static int pcap_user_init(void *data, void *dev) return 0; } -static int pcap_open(void *data) +static int pcap_user_open(void *data) { struct pcap_data *pri = data; __u32 netmask; @@ -44,14 +44,14 @@ static int pcap_open(void *data) if (pri->filter != NULL) { err = dev_netmask(pri->dev, &netmask); if (err < 0) { - printk(UM_KERN_ERR "pcap_open : dev_netmask failed\n"); + printk(UM_KERN_ERR "pcap_user_open : dev_netmask failed\n"); return -EIO; } pri->compiled = uml_kmalloc(sizeof(struct bpf_program), UM_GFP_KERNEL); if (pri->compiled == NULL) { - printk(UM_KERN_ERR "pcap_open : kmalloc failed\n"); + printk(UM_KERN_ERR "pcap_user_open : kmalloc failed\n"); return -ENOMEM; } @@ -59,14 +59,14 @@ static int pcap_open(void *data) (struct bpf_program *) pri->compiled, pri->filter, pri->optimize, netmask); if (err < 0) { - printk(UM_KERN_ERR "pcap_open : pcap_compile failed - " + printk(UM_KERN_ERR "pcap_user_open : pcap_compile failed - " "'%s'\n", pcap_geterr(pri->pcap)); goto out; } err = pcap_setfilter(pri->pcap, pri->compiled); if (err < 0) { - printk(UM_KERN_ERR "pcap_open : pcap_setfilter " + printk(UM_KERN_ERR "pcap_user_open : pcap_setfilter " "failed - '%s'\n", pcap_geterr(pri->pcap)); goto out; } @@ -127,7 +127,7 @@ int pcap_user_read(int fd, void *buffer, int len, struct pcap_data *pri) const struct net_user_info pcap_user_info = { .init = pcap_user_init, - .open = pcap_open, + .open = pcap_user_open, .close = NULL, .remove = pcap_remove, .add_address = NULL, diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c index 8016d32b6809..482a19c5105c 100644 --- a/arch/um/drivers/slip_user.c +++ b/arch/um/drivers/slip_user.c @@ -9,7 +9,7 @@ #include <errno.h> #include <fcntl.h> #include <string.h> -#include <sys/termios.h> +#include <termios.h> #include <sys/wait.h> #include <net_user.h> #include <os.h> diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 8735c468230a..555203e3e7b4 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1403,7 +1403,7 @@ static int vector_net_load_bpf_flash(struct net_device *dev, kfree(vp->bpf->filter); vp->bpf->filter = NULL; } else { - vp->bpf = kmalloc(sizeof(struct sock_fprog), GFP_KERNEL); + vp->bpf = kmalloc(sizeof(struct sock_fprog), GFP_ATOMIC); if (vp->bpf == NULL) { netdev_err(dev, "failed to allocate memory for firmware\n"); goto flash_fail; @@ -1415,7 +1415,7 @@ static int vector_net_load_bpf_flash(struct net_device *dev, if (request_firmware(&fw, efl->data, &vdevice->pdev.dev)) goto flash_fail; - vp->bpf->filter = kmemdup(fw->data, fw->size, GFP_KERNEL); + vp->bpf->filter = kmemdup(fw->data, fw->size, GFP_ATOMIC); if (!vp->bpf->filter) goto free_buffer; diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index c4a0f26b2824..bae53220ce26 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -18,9 +18,7 @@ #include <fcntl.h> #include <sys/socket.h> #include <sys/un.h> -#include <net/ethernet.h> #include <netinet/ip.h> -#include <netinet/ether.h> #include <linux/if_ether.h> #include <linux/if_packet.h> #include <sys/wait.h> @@ -39,6 +37,7 @@ #define ID_MAX 2 #define TOKEN_IFNAME "ifname" +#define TOKEN_SCRIPT "ifup" #define TRANS_RAW "raw" #define TRANS_RAW_LEN strlen(TRANS_RAW) @@ -55,6 +54,9 @@ #define MAX_UN_LEN 107 +static const char padchar[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +static const char *template = "tapXXXXXX"; + /* This is very ugly and brute force lookup, but it is done * only once at initialization so not worth doing hashes or * anything more intelligent @@ -191,16 +193,21 @@ raw_fd_cleanup: return err; } + static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) { - int fd = -1; + int fd = -1, i; char *iface; struct vector_fds *result = NULL; + bool dynamic = false; + char dynamic_ifname[IFNAMSIZ]; + char *argv[] = {NULL, NULL, NULL, NULL}; iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); if (iface == NULL) { - printk(UM_KERN_ERR "uml_tap: failed to parse interface spec\n"); - goto tap_cleanup; + dynamic = true; + iface = dynamic_ifname; + srand(getpid()); } result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); @@ -214,14 +221,30 @@ static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) result->remote_addr_size = 0; /* TAP */ + do { + if (dynamic) { + strcpy(iface, template); + for (i = 0; i < strlen(iface); i++) { + if (iface[i] == 'X') { + iface[i] = padchar[rand() % strlen(padchar)]; + } + } + } + fd = create_tap_fd(iface); + if ((fd < 0) && (!dynamic)) { + printk(UM_KERN_ERR "uml_tap: failed to create tun interface\n"); + goto tap_cleanup; + } + result->tx_fd = fd; + result->rx_fd = fd; + } while (fd < 0); - fd = create_tap_fd(iface); - if (fd < 0) { - printk(UM_KERN_ERR "uml_tap: failed to create tun interface\n"); - goto tap_cleanup; + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); } - result->tx_fd = fd; - result->rx_fd = fd; + return result; tap_cleanup: printk(UM_KERN_ERR "user_init_tap: init failed, error %d", fd); @@ -233,6 +256,7 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec) { char *iface; struct vector_fds *result = NULL; + char *argv[] = {NULL, NULL, NULL, NULL}; iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); if (iface == NULL) { @@ -266,6 +290,12 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec) "uml_tap: failed to create paired raw socket: %i\n", result->rx_fd); goto hybrid_cleanup; } + + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); + } return result; hybrid_cleanup: printk(UM_KERN_ERR "user_init_hybrid: init failed"); @@ -332,7 +362,7 @@ static struct vector_fds *user_init_unix_fds(struct arglist *ifspec, int id) } switch (id) { case ID_BESS: - if (connect(fd, remote_addr, sizeof(struct sockaddr_un)) < 0) { + if (connect(fd, (const struct sockaddr *) remote_addr, sizeof(struct sockaddr_un)) < 0) { printk(UM_KERN_ERR "bess open:cannot connect to %s %i", remote_addr->sun_path, -errno); goto unix_cleanup; } @@ -399,8 +429,7 @@ static struct vector_fds *user_init_fd_fds(struct arglist *ifspec) fd_cleanup: if (fd >= 0) os_close_file(fd); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } @@ -410,6 +439,7 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) int err = -ENOMEM; char *iface; struct vector_fds *result = NULL; + char *argv[] = {NULL, NULL, NULL, NULL}; iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); if (iface == NULL) @@ -432,6 +462,11 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) result->remote_addr = NULL; result->remote_addr_size = 0; } + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); + } return result; raw_cleanup: printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err); @@ -789,10 +824,12 @@ void *uml_vector_user_bpf(char *filename) return false; } bpf_prog = uml_kmalloc(sizeof(struct sock_fprog), UM_GFP_KERNEL); - if (bpf_prog != NULL) { - bpf_prog->len = statbuf.st_size / sizeof(struct sock_filter); - bpf_prog->filter = NULL; + if (bpf_prog == NULL) { + printk(KERN_ERR "Failed to allocate bpf prog buffer"); + return NULL; } + bpf_prog->len = statbuf.st_size / sizeof(struct sock_filter); + bpf_prog->filter = NULL; ffd = os_open_file(filename, of_read(OPENFLAGS()), 0); if (ffd < 0) { printk(KERN_ERR "Error %d opening bpf file", -errno); diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 8d435f8a6dec..1c63b260ecc4 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += mmiowb.h +generic-y += module.lds.h generic-y += param.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 26b5e243d3fc..3bed09538dd9 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -101,7 +101,7 @@ void interrupt_end(void) schedule(); if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index 10c99e058fca..d1cffc2a7f21 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c @@ -35,14 +35,14 @@ int write_sigio_irq(int fd) } /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ -static DEFINE_SPINLOCK(sigio_spinlock); +static DEFINE_MUTEX(sigio_mutex); void sigio_lock(void) { - spin_lock(&sigio_spinlock); + mutex_lock(&sigio_mutex); } void sigio_unlock(void) { - spin_unlock(&sigio_spinlock); + mutex_unlock(&sigio_mutex); } diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index acbc879d2773..7452f70d50d0 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -47,12 +47,10 @@ void show_stack(struct task_struct *task, unsigned long *stack, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("%s\n", loglvl); + pr_cont("\n"); pr_cont(" %08lx", *stack++); } - printk("%s\n", loglvl); printk("%sCall Trace:\n", loglvl); dump_trace(current, &stackops, (void *)loglvl); - printk("%s\n", loglvl); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 25eaa6a0c658..3d109ff3309b 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -70,13 +70,17 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, * read of the message and write of the ACK. */ if (mode != TTMH_READ) { + bool disabled = irqs_disabled(); + + BUG_ON(mode == TTMH_IDLE && !disabled); + + if (disabled) + local_irq_enable(); while (os_poll(1, &time_travel_ext_fd) != 0) { - if (mode == TTMH_IDLE) { - BUG_ON(!irqs_disabled()); - local_irq_enable(); - local_irq_disable(); - } + /* nothing */ } + if (disabled) + local_irq_disable(); } ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); @@ -102,6 +106,7 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, break; } + resp.seq = msg->seq; os_write_file(time_travel_ext_fd, &resp, sizeof(resp)); } diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 9e16078a4bf8..1d7558dac75f 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -97,7 +97,7 @@ static int remove_files_and_dir(char *dir) while ((ent = readdir(directory)) != NULL) { if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) continue; - len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1; + len = strlen(dir) + strlen("/") + strlen(ent->d_name) + 1; if (len > sizeof(file)) { ret = -E2BIG; goto out; @@ -135,7 +135,7 @@ out: */ static inline int is_umdir_used(char *dir) { - char pid[sizeof("nnnnn\0")], *end, *file; + char pid[sizeof("nnnnnnnnn")], *end, *file; int dead, fd, p, n, err; size_t filelen; @@ -217,10 +217,10 @@ static int umdir_take_if_dead(char *dir) static void __init create_pid_file(void) { - char pid[sizeof("nnnnn\0")], *file; + char pid[sizeof("nnnnnnnnn")], *file; int fd, n; - n = strlen(uml_dir) + UMID_LEN + sizeof("/pid\0"); + n = strlen(uml_dir) + UMID_LEN + sizeof("/pid"); file = malloc(n); if (!file) return; diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index ecf2f390fad2..07327425d06e 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -10,7 +10,7 @@ #include <signal.h> #include <string.h> #include <termios.h> -#include <wait.h> +#include <sys/wait.h> #include <sys/mman.h> #include <sys/utsname.h> #include <init.h> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9b6931f8d555..0d0667a9fbd7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -444,3 +444,4 @@ 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 +440 i386 process_madvise sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 347809649ba2..1f47e24fb65c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -361,6 +361,7 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 215376d975a2..21243747965d 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -9,8 +9,6 @@ ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE| ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE include $(srctree)/lib/vdso/Makefile -KBUILD_CFLAGS += $(DISABLE_LTO) - # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n UBSAN_SANITIZE := n @@ -176,7 +174,7 @@ quiet_cmd_vdso = VDSO $@ -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -shared --hash-style=both --build-id \ +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \ $(call ld-option, --eh-frame-hdr) -Bsymbolic GCOV_PROFILE := n diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index ca8a657edf59..a09fc37ead9d 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -239,7 +239,6 @@ beyond_if: (regs)->ss = __USER32_DS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - set_fs(USER_DS); return 0; } diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index fed67eafcacc..bb1654fe0ce7 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -8,10 +8,8 @@ */ #include <linux/scatterlist.h> -#include <linux/dma-debug.h> #include <asm/io.h> #include <asm/swiotlb.h> -#include <linux/dma-contiguous.h> extern int iommu_merge; extern int panic_on_overflow; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5303dbc5c9bc..d44858b69353 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -80,13 +80,14 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) -#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) +#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) +#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -132,7 +133,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 80 +#define KVM_MAX_CPUID_ENTRIES 256 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 @@ -636,7 +637,7 @@ struct kvm_vcpu_arch { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + struct kvm_cpuid_entry2 *cpuid_entries; int maxphyaddr; int max_tdp_level; @@ -788,6 +789,21 @@ struct kvm_vcpu_arch { /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; + + /* pv related cpuid info */ + struct { + /* + * value of the eax register in the KVM_CPUID_FEATURES CPUID + * leaf. + */ + u32 features; + + /* + * indicates whether pv emulation should be disabled if features + * are not present in the guest's cpuid + */ + bool enforce; + } pv_cpuid; }; struct kvm_lpage_info { @@ -860,6 +876,13 @@ struct kvm_hv { struct kvm_hv_syndbg hv_syndbg; }; +struct msr_bitmap_range { + u32 flags; + u32 nmsrs; + u32 base; + unsigned long *bitmap; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ @@ -961,8 +984,31 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ + u32 user_space_msr_mask; + + struct { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; + } msr_filter; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; + + /* + * Whether the TDP MMU is enabled for this VM. This contains a + * snapshot of the TDP MMU module parameter from when the VM was + * created and remains unchanged for the life of the VM. If this is + * true, TDP MMU handler functions will run for various MMU + * operations. + */ + bool tdp_mmu_enabled; + + /* List of struct tdp_mmu_pages being used as roots */ + struct list_head tdp_mmu_roots; + /* List of struct tdp_mmu_pages not being used as roots */ + struct list_head tdp_mmu_pages; }; struct kvm_vm_stat { @@ -1069,7 +1115,7 @@ struct kvm_x86_ops { void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1143,7 +1189,12 @@ struct kvm_x86_ops { /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + /* + * Retrieve somewhat arbitrary exit information. Intended to be used + * only from within tracepoints to avoid VMREADs when tracing is off. + */ + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *exit_int_info, u32 *exit_int_info_err_code); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, @@ -1221,12 +1272,13 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); + void (*msr_filter_changed)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { @@ -1238,7 +1290,7 @@ struct kvm_x86_nested_ops { int (*set_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu); int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, @@ -1612,8 +1664,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -void kvm_define_shared_msr(unsigned index, u32 msr); -int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 4f77b8f22e54..ffc289992d1b 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -54,6 +54,7 @@ typedef int (*hyperv_fill_flush_list_func)( #define hv_enable_vdso_clocksource() \ vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); #define hv_get_raw_timer() rdtsc_ordered() +#define hv_get_vector() HYPERVISOR_CALLBACK_VECTOR /* * Reference to pv_ops must be inline so objtool diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 86f20d520a07..0b4920a7238e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -60,22 +60,20 @@ struct saved_msrs { #define EAX_EDX_RET(val, low, high) "=A" (val) #endif -#ifdef CONFIG_TRACEPOINTS /* * Be very careful with includes. This header is prone to include loops. */ #include <asm/atomic.h> #include <linux/tracepoint-defs.h> -extern struct tracepoint __tracepoint_read_msr; -extern struct tracepoint __tracepoint_write_msr; -extern struct tracepoint __tracepoint_rdpmc; -#define msr_tracepoint_active(t) static_key_false(&(t).key) +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(read_msr); +DECLARE_TRACEPOINT(write_msr); +DECLARE_TRACEPOINT(rdpmc); extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); #else -#define msr_tracepoint_active(t) false static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} @@ -128,7 +126,7 @@ static inline unsigned long long native_read_msr(unsigned int msr) val = __rdmsr(msr); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, val, 0); return val; @@ -150,7 +148,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) : "c" (msr), [fault] "i" (-EIO)); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); return EAX_EDX_VAL(val, low, high); } @@ -161,7 +159,7 @@ native_write_msr(unsigned int msr, u32 low, u32 high) { __wrmsr(msr, low, high); - if (msr_tracepoint_active(__tracepoint_write_msr)) + if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } @@ -181,7 +179,7 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high) : "c" (msr), "0" (low), "d" (high), [fault] "i" (-EIO) : "memory"); - if (msr_tracepoint_active(__tracepoint_write_msr)) + if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } @@ -248,7 +246,7 @@ static inline unsigned long long native_read_pmc(int counter) DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); - if (msr_tracepoint_active(__tracepoint_rdpmc)) + if (tracepoint_enabled(rdpmc)) do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 86651e86289d..cb9ad6b73973 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -314,19 +314,19 @@ static inline void mds_idle_clear_cpu_buffers(void) * lfence * jmp spec_trap * do_rop: - * mov %rax,(%rsp) for x86_64 + * mov %rcx,(%rsp) for x86_64 * mov %edx,(%esp) for x86_32 * retq * * Without retpolines configured: * - * jmp *%rax for x86_64 + * jmp *%rcx for x86_64 * jmp *%edx for x86_32 */ #ifdef CONFIG_RETPOLINE # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 17 -# define RETPOLINE_RAX_BPF_JIT() \ +# define RETPOLINE_RCX_BPF_JIT_SIZE 17 +# define RETPOLINE_RCX_BPF_JIT() \ do { \ EMIT1_off32(0xE8, 7); /* callq do_rop */ \ /* spec_trap: */ \ @@ -334,7 +334,7 @@ do { \ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ + EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ EMIT1(0xC3); /* retq */ \ } while (0) # else /* !CONFIG_X86_64 */ @@ -352,9 +352,9 @@ do { \ # endif #else /* !CONFIG_RETPOLINE */ # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 2 -# define RETPOLINE_RAX_BPF_JIT() \ - EMIT2(0xFF, 0xE0); /* jmp *%rax */ +# define RETPOLINE_RCX_BPF_JIT_SIZE 2 +# define RETPOLINE_RCX_BPF_JIT() \ + EMIT2(0xFF, 0xE1); /* jmp *%rcx */ # else /* !CONFIG_X86_64 */ # define RETPOLINE_EDX_BPF_JIT() \ EMIT2(0xFF, 0xE2) /* jmp *%edx */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 565ad755c785..f462895a33e4 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -42,6 +42,17 @@ #endif /* CONFIG_X86_PAE */ /* + * User space process size: 3GB (default). + */ +#define IA32_PAGE_OFFSET __PAGE_OFFSET +#define TASK_SIZE __PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE +#define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +/* * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) */ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d0c6c10c18a0..3f49dac03617 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -60,6 +60,44 @@ #endif /* + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] + */ +#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) +#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) + +#define STACK_TOP TASK_SIZE_LOW +#define STACK_TOP_MAX TASK_SIZE_MAX + +/* * Maximum kernel image size is limited to 1 GiB, due to the fixmap living * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). * diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ac507586769..82a08b585818 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -482,10 +482,6 @@ extern unsigned int fpu_user_xstate_size; struct perf_event; -typedef struct { - unsigned long seg; -} mm_segment_t; - struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -538,8 +534,6 @@ struct thread_struct { */ unsigned long iopl_emul; - mm_segment_t addr_limit; - unsigned int sig_on_uaccess_err:1; /* Floating point and extended processor state */ @@ -783,67 +777,15 @@ static inline void spin_lock_prefetch(const void *x) }) #ifdef CONFIG_X86_32 -/* - * User space process size: 3GB (default). - */ -#define IA32_PAGE_OFFSET PAGE_OFFSET -#define TASK_SIZE PAGE_OFFSET -#define TASK_SIZE_LOW TASK_SIZE -#define TASK_SIZE_MAX TASK_SIZE -#define DEFAULT_MAP_WINDOW TASK_SIZE -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ - .addr_limit = KERNEL_DS, \ } #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -/* - * User space process size. This is the first address outside the user range. - * There are a few constraints that determine this: - * - * On Intel CPUs, if a SYSCALL instruction is at the highest canonical - * address, then that syscall will enter the kernel with a - * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything executable - * from being mapped at the maximum canonical address. - * - * On AMD CPUs in the Ryzen family, there's a nasty bug in which the - * CPUs malfunction if they execute code from the highest canonical page. - * They'll speculate right off the end of the canonical space, and - * bad things happen. This is worked around in the same way as the - * Intel problem. - * - * With page table isolation enabled, we map the LDT in ... [stay tuned] - */ -#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) - -#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ - 0xc0000000 : 0xFFFFe000) - -#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ - IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) -#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ - IA32_PAGE_OFFSET : TASK_SIZE_MAX) - -#define STACK_TOP TASK_SIZE_LOW -#define STACK_TOP_MAX TASK_SIZE_MAX - -#define INIT_THREAD { \ - .addr_limit = KERNEL_DS, \ -} +#define INIT_THREAD { } extern unsigned long KSTK_ESP(struct task_struct *task); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index cf13f9e78585..71d630bb5e08 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -3,10 +3,54 @@ #define __SVM_H #include <uapi/asm/svm.h> - +#include <uapi/asm/kvm.h> + +/* + * 32-bit intercept words in the VMCB Control Area, starting + * at Byte offset 000h. + */ + +enum intercept_words { + INTERCEPT_CR = 0, + INTERCEPT_DR, + INTERCEPT_EXCEPTION, + INTERCEPT_WORD3, + INTERCEPT_WORD4, + INTERCEPT_WORD5, + MAX_INTERCEPT, +}; enum { - INTERCEPT_INTR, + /* Byte offset 000h (word 0) */ + INTERCEPT_CR0_READ = 0, + INTERCEPT_CR3_READ = 3, + INTERCEPT_CR4_READ = 4, + INTERCEPT_CR8_READ = 8, + INTERCEPT_CR0_WRITE = 16, + INTERCEPT_CR3_WRITE = 16 + 3, + INTERCEPT_CR4_WRITE = 16 + 4, + INTERCEPT_CR8_WRITE = 16 + 8, + /* Byte offset 004h (word 1) */ + INTERCEPT_DR0_READ = 32, + INTERCEPT_DR1_READ, + INTERCEPT_DR2_READ, + INTERCEPT_DR3_READ, + INTERCEPT_DR4_READ, + INTERCEPT_DR5_READ, + INTERCEPT_DR6_READ, + INTERCEPT_DR7_READ, + INTERCEPT_DR0_WRITE = 48, + INTERCEPT_DR1_WRITE, + INTERCEPT_DR2_WRITE, + INTERCEPT_DR3_WRITE, + INTERCEPT_DR4_WRITE, + INTERCEPT_DR5_WRITE, + INTERCEPT_DR6_WRITE, + INTERCEPT_DR7_WRITE, + /* Byte offset 008h (word 2) */ + INTERCEPT_EXCEPTION_OFFSET = 64, + /* Byte offset 00Ch (word 3) */ + INTERCEPT_INTR = 96, INTERCEPT_NMI, INTERCEPT_SMI, INTERCEPT_INIT, @@ -38,7 +82,8 @@ enum { INTERCEPT_TASK_SWITCH, INTERCEPT_FERR_FREEZE, INTERCEPT_SHUTDOWN, - INTERCEPT_VMRUN, + /* Byte offset 010h (word 4) */ + INTERCEPT_VMRUN = 128, INTERCEPT_VMMCALL, INTERCEPT_VMLOAD, INTERCEPT_VMSAVE, @@ -53,15 +98,18 @@ enum { INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, INTERCEPT_RDPRU, + /* Byte offset 014h (word 5) */ + INTERCEPT_INVLPGB = 160, + INTERCEPT_INVLPGB_ILLEGAL, + INTERCEPT_INVPCID, + INTERCEPT_MCOMMIT, + INTERCEPT_TLBSYNC, }; struct __attribute__ ((__packed__)) vmcb_control_area { - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - u8 reserved_1[40]; + u32 intercepts[MAX_INTERCEPT]; + u32 reserved_1[15 - MAX_INTERCEPT]; u16 pause_filter_thresh; u16 pause_filter_count; u64 iopm_base_pa; @@ -287,32 +335,6 @@ struct vmcb { #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK #define SVM_SELECTOR_CODE_MASK (1 << 3) -#define INTERCEPT_CR0_READ 0 -#define INTERCEPT_CR3_READ 3 -#define INTERCEPT_CR4_READ 4 -#define INTERCEPT_CR8_READ 8 -#define INTERCEPT_CR0_WRITE (16 + 0) -#define INTERCEPT_CR3_WRITE (16 + 3) -#define INTERCEPT_CR4_WRITE (16 + 4) -#define INTERCEPT_CR8_WRITE (16 + 8) - -#define INTERCEPT_DR0_READ 0 -#define INTERCEPT_DR1_READ 1 -#define INTERCEPT_DR2_READ 2 -#define INTERCEPT_DR3_READ 3 -#define INTERCEPT_DR4_READ 4 -#define INTERCEPT_DR5_READ 5 -#define INTERCEPT_DR6_READ 6 -#define INTERCEPT_DR7_READ 7 -#define INTERCEPT_DR0_WRITE (16 + 0) -#define INTERCEPT_DR1_WRITE (16 + 1) -#define INTERCEPT_DR2_WRITE (16 + 2) -#define INTERCEPT_DR3_WRITE (16 + 3) -#define INTERCEPT_DR4_WRITE (16 + 4) -#define INTERCEPT_DR5_WRITE (16 + 5) -#define INTERCEPT_DR6_WRITE (16 + 6) -#define INTERCEPT_DR7_WRITE (16 + 7) - #define SVM_EVTINJ_VEC_MASK 0xff #define SVM_EVTINJ_TYPE_SHIFT 8 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 267701ae3d86..44733a4bfc42 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -102,7 +102,6 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ -#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -131,7 +130,6 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 477c503f2753..c9fa7be3df82 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -13,30 +13,6 @@ #include <asm/extable.h> /* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - */ - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(-1UL) -#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) - -#define get_fs() (current->thread.addr_limit) -static inline void set_fs(mm_segment_t fs) -{ - current->thread.addr_limit = fs; - /* On user-mode return, check fs is correct */ - set_thread_flag(TIF_FSCHECK); -} - -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#define user_addr_max() (current->thread.addr_limit.seg) - -/* * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. */ @@ -93,7 +69,7 @@ static inline bool pagefault_disabled(void); #define access_ok(addr, size) \ ({ \ WARN_ON_IN_IRQ(); \ - likely(!__range_not_ok(addr, size, user_addr_max())); \ + likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \ }) extern int __get_user_1(void); @@ -232,16 +208,24 @@ extern void __put_user_nocheck_2(void); extern void __put_user_nocheck_4(void); extern void __put_user_nocheck_8(void); +/* + * ptr must be evaluated and assigned to the temporary __ptr_pu before + * the assignment of x to __val_pu, to avoid any function calls + * involved in the ptr expression (possibly implicitly generated due + * to KASAN) from clobbering %ax. + */ #define do_put_user_call(fn,x,ptr) \ ({ \ int __ret_pu; \ + void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ __chk_user_ptr(ptr); \ + __ptr_pu = (ptr); \ __val_pu = (x); \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ - : "0" (ptr), \ + : "0" (__ptr_pu), \ "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cd7de4b401fe..f8ba5289ecb0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -52,7 +52,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES) #define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT) #define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING) -#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP) +#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP) #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC) #define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID) #define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0780f97c1850..89e5f3d1bba8 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -192,6 +192,26 @@ struct kvm_msr_list { __u32 indices[0]; }; +/* Maximum size of any access bitmap in bytes */ +#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600 + +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range { +#define KVM_MSR_FILTER_READ (1 << 0) +#define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ +}; + +#define KVM_MSR_FILTER_MAX_RANGES 16 +struct kvm_msr_filter { +#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; struct kvm_cpuid_entry { __u32 function; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index a7a3403645e5..f1d8307454e0 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -77,6 +77,7 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_RDPRU 0x08e +#define SVM_EXIT_INVPCID 0x0a2 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 @@ -182,6 +183,7 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_INVPCID, "invpcid" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index e89031e9c847..9ac696487b13 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -32,6 +32,7 @@ #include <linux/gfp.h> #include <linux/atomic.h> #include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <asm/mtrr.h> #include <asm/proto.h> #include <asm/iommu.h> @@ -96,8 +97,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT); spin_lock_irqsave(&iommu_bitmap_lock, flags); offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, @@ -468,7 +468,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, { void *vaddr; - vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs); + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); if (!vaddr || !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) return vaddr; @@ -480,7 +480,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, goto out_free; return vaddr; out_free: - dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs); + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -490,7 +490,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); - dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); } static int no_agp; @@ -678,6 +678,8 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 3ca07ad552ae..70b7154f4bdd 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -38,9 +38,6 @@ static void __used common(void) #endif BLANK(); - OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); - - BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); BLANK(); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 1c08cb9eb9f6..4102b866e7c0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1277,7 +1277,7 @@ static void queue_task_work(struct mce *m, int kill_it) else current->mce_kill_me.func = kill_me_maybe; - task_work_add(current, ¤t->mce_kill_me, true); + task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); } /* diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 9834a43cd0fa..05ef1f4550cb 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -55,9 +55,14 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) set_irq_regs(old_regs); } -void hv_setup_vmbus_irq(void (*handler)(void)) +int hv_setup_vmbus_irq(int irq, void (*handler)(void)) { + /* + * The 'irq' argument is ignored on x86/x64 because a hard-coded + * interrupt vector is used for Hyper-V interrupts. + */ vmbus_handler = handler; + return 0; } void hv_remove_vmbus_irq(void) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index b494187632b2..af323e2e3100 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -561,7 +561,7 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * callback has been invoked. */ atomic_inc(&rdtgrp->waitcount); - ret = task_work_add(tsk, &callback->work, true); + ret = task_work_add(tsk, &callback->work, TWA_RESUME); if (ret) { /* * Task is exiting. Drop the refcount and free the callback. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1c0f2560a41c..7f57ede3cb8e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -975,7 +975,7 @@ void arch_haltpoll_disable(unsigned int cpu) if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) return; - /* Enable guest halt poll disables host halt poll */ + /* Disable guest halt poll enables host halt poll */ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); } EXPORT_SYMBOL_GPL(arch_haltpoll_disable); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5dcedad21dff..de234e7a8962 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/dma-map-ops.h> #include <linux/dma-direct.h> -#include <linux/dma-debug.h> #include <linux/iommu.h> #include <linux/dmar.h> #include <linux/export.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b16caee53bea..84f581c91db4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -7,6 +7,7 @@ */ #include <linux/console.h> #include <linux/crash_dump.h> +#include <linux/dma-map-ops.h> #include <linux/dmi.h> #include <linux/efi.h> #include <linux/init_ohci1394_dma.h> @@ -20,6 +21,7 @@ #include <linux/tboot.h> #include <linux/usb/xhci-dbgp.h> #include <linux/static_call.h> +#include <linux/swiotlb.h> #include <uapi/linux/mount.h> diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index 720cde885042..6cf65397d225 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -251,6 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, .tls = tls_val, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbd5bd7a945a..f92dfd8ef10d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -66,6 +66,7 @@ config KVM_WERROR default y if X86_64 && !KASAN # We use the dependency on !COMPILE_TEST to not be enabled # blindly in allmodconfig or allyesconfig configurations + depends on KVM depends on (X86_64 && !KASAN) || !COMPILE_TEST depends on EXPERT help diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4a3081e9f4b5..b804444e16d4 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,9 +15,11 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ + mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o -kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o +kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ + vmx/evmcs.o vmx/nested.o vmx/posted_intr.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7456f9ad424b..06a278b3701d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -54,7 +54,24 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit -static int kvm_check_cpuid(struct kvm_vcpu *vcpu) +static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *e; + int i; + + for (i = 0; i < nent; i++) { + e = &entries[i]; + + if (e->function == function && (e->index == index || + !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) + return e; + } + + return NULL; +} + +static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; @@ -62,7 +79,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, 0); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -107,6 +124,13 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) @@ -121,8 +145,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); - best = kvm_find_cpuid_entry(vcpu, 1, 0); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -146,7 +168,9 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - kvm_x86_ops.update_exception_bitmap(vcpu); + + /* Invoke the vendor callback only after the above state is updated. */ + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); } static int is_efer_nx(void) @@ -186,7 +210,6 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } -EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, @@ -194,46 +217,53 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry __user *entries) { int r, i; - struct kvm_cpuid_entry *cpuid_entries = NULL; + struct kvm_cpuid_entry *e = NULL; + struct kvm_cpuid_entry2 *e2 = NULL; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; + return -E2BIG; + if (cpuid->nent) { - cpuid_entries = vmemdup_user(entries, - array_size(sizeof(struct kvm_cpuid_entry), - cpuid->nent)); - if (IS_ERR(cpuid_entries)) { - r = PTR_ERR(cpuid_entries); - goto out; + e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent)); + if (IS_ERR(e)) + return PTR_ERR(e); + + e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT); + if (!e2) { + r = -ENOMEM; + goto out_free_cpuid; } } for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; + e2[i].function = e[i].function; + e2[i].eax = e[i].eax; + e2[i].ebx = e[i].ebx; + e2[i].ecx = e[i].ecx; + e2[i].edx = e[i].edx; + e2[i].index = 0; + e2[i].flags = 0; + e2[i].padding[0] = 0; + e2[i].padding[1] = 0; + e2[i].padding[2] = 0; } - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + + r = kvm_check_cpuid(e2, cpuid->nent); if (r) { - vcpu->arch.cpuid_nent = 0; - kvfree(cpuid_entries); - goto out; + kvfree(e2); + goto out_free_cpuid; } + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); - kvfree(cpuid_entries); -out: +out_free_cpuid: + kvfree(e); + return r; } @@ -241,26 +271,32 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { + struct kvm_cpuid_entry2 *e2 = NULL; int r; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + return -E2BIG; + + if (cpuid->nent) { + e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent)); + if (IS_ERR(e2)) + return PTR_ERR(e2); + } + + r = kvm_check_cpuid(e2, cpuid->nent); if (r) { - vcpu->arch.cpuid_nent = 0; - goto out; + kvfree(e2); + return r; } + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); -out: - return r; + + return 0; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, @@ -941,17 +977,8 @@ out_free: struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index) { - struct kvm_cpuid_entry2 *e; - int i; - - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - - if (e->function == function && (e->index == index || - !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) - return e; - } - return NULL; + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 3a923ae15f2f..bf8577947ed2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include "x86.h" #include <asm/cpu.h> #include <asm/processor.h> +#include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); @@ -34,6 +35,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); +} + struct cpuid_reg { u32 function; u32 index; @@ -308,4 +314,13 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); } +static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, + unsigned int kvm_feature) +{ + if (!vcpu->arch.pv_cpuid.enforce) + return true; + + return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2f6510de6b0c..0d917eb70319 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3606,7 +3606,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt) u64 tsc_aux = 0; if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) - return emulate_gp(ctxt, 0); + return emulate_ud(ctxt); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } @@ -3701,21 +3701,35 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt) static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) + r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; + + if (r > 0) return emulate_gp(ctxt, 0); - return X86EMUL_CONTINUE; + return r < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; } static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; + + r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; - if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) + if (r) return emulate_gp(ctxt, 0); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 8c1e8334eff0..5c7c4060b45c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -633,6 +633,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); @@ -652,6 +657,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; + trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cfe83d4ae625..a889563ad02d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 35cca2e0c802..105e7859d1f2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) +{ + kvm_lapic_set_reg(apic, APIC_DFR, val); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); +} + static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); @@ -488,6 +494,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) } } +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) +{ + apic_clear_irr(vec, vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); + static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { struct kvm_vcpu *vcpu; @@ -1576,9 +1588,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1593,7 +1602,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1629,14 +1641,15 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } atomic_inc(&apic->lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) @@ -1984,10 +1997,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) { - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); - } else + if (!apic_x2apic_mode(apic)) + kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); + else ret = 1; break; @@ -2183,8 +2195,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || - !apic_lvtt_tscdeadline(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -2194,8 +2205,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); @@ -2303,7 +2313,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); + kvm_apic_set_dfr(apic, 0xffffffffU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) @@ -2461,6 +2471,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } +EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 754f29beb83e..4fb86e3a9dd3 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -89,6 +89,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5efc6081ca13..9c4a9c8e43d9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -155,11 +155,6 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); -} - /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 71aa3da2a0b7..17587f496ec7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -19,10 +19,12 @@ #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" +#include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "cpuid.h" +#include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -45,7 +47,6 @@ #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> -#include <asm/e820/api.h> #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> @@ -64,12 +65,12 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); -static struct kernel_param_ops nx_huge_pages_ops = { +static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { +static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { .set = set_nx_huge_pages_recovery_ratio, .get = param_get_uint, }; @@ -104,45 +105,13 @@ enum { AUDIT_POST_SYNC }; -#undef MMU_DEBUG - #ifdef MMU_DEBUG -static bool dbg = 0; +bool dbg = 0; module_param(dbg, bool, 0644); - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) -#else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 - -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. - */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - #define PT32_LEVEL_BITS 10 #define PT32_LEVEL_SHIFT(level) \ @@ -156,18 +125,6 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) -#else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) - #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) @@ -175,42 +132,11 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask | shadow_me_mask) - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -/* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull - #include <trace/events/kvm.h> -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 -/* - * Return values of handle_mmio_page_fault and mmu.page_fault: - * RET_PF_RETRY: let CPU fault again on the address. - * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. - * - * For handle_mmio_page_fault only: - * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. - */ -enum { - RET_PF_RETRY = 0, - RET_PF_EMULATE = 1, - RET_PF_INVALID = 2, -}; - struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -242,65 +168,10 @@ struct kvm_shadow_walk_iterator { __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; +struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_nx_mask; -static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ -static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; -static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_value; -static u64 __read_mostly shadow_mmio_access_mask; -static u64 __read_mostly shadow_present_mask; -static u64 __read_mostly shadow_me_mask; - -/* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; - * shadow_acc_track_mask is the set of bits to be cleared in non-accessed - * pages. - */ -static u64 __read_mostly shadow_acc_track_mask; - -/* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. - */ -static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | - PT64_EPT_EXECUTABLE_MASK; -static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; - -/* - * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order - * to guard against L1TF attacks. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; - -/* - * The number of high-order 1 bits to use in the mask above. - */ -static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; - -/* - * In some cases, we need to preserve the GFN of a non-present or reserved - * SPTE when we usurp the upper five bits of the physical address space to - * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll - * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask - * left into the reserved bits, i.e. the GFN in the SPTE will be split into - * high and low parts. This mask covers the lower bits of the GFN. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; - -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -static u8 __read_mostly shadow_phys_bits; - static void mmu_spte_set(u64 *sptep, u64 spte); -static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -325,7 +196,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } -static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages) { struct kvm_tlb_range range; @@ -336,143 +207,17 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) -{ - BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); - WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_access_mask = access_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); - -static bool is_mmio_spte(u64 spte) -{ - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; -} - -static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) -{ - return sp->role.ad_disabled; -} - -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) -{ - /* - * When using the EPT page-modification log, the GPAs in the log - * would come from L2 rather than L1. Therefore, we need to rely - * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. - */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; -} - -static inline bool spte_ad_enabled(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; -} - -static inline bool spte_ad_need_write_protect(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; -} - -static bool is_nx_huge_page_enabled(void) +bool is_nx_huge_page_enabled(void) { return READ_ONCE(nx_huge_pages); } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - -static inline bool is_access_track_spte(u64 spte) -{ - return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; -} - -/* - * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of - * the memslots generation and is derived as follows: - * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 - * - * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in - * the MMIO generation number, as doing so would require stealing a bit from - * the "real" generation number and thus effectively halve the maximum number - * of MMIO generations that can be handled before encountering a wrap (which - * requires a full MMU zap). The flag is instead explicitly queried when - * checking for MMIO spte cache hits. - */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) - -#define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 -#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ - MMIO_SPTE_GEN_LOW_START) - -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT -#define MMIO_SPTE_GEN_HIGH_END 62 -#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ - MMIO_SPTE_GEN_HIGH_START) - -static u64 generation_mmio_spte_mask(u64 gen) -{ - u64 mask; - - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); - - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; - mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; - return mask; -} - -static u64 get_mmio_spte_generation(u64 spte) -{ - u64 gen; - - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; - gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen; -} - -static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) -{ - - u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); - u64 gpa = gfn << PAGE_SHIFT; - - access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) - << shadow_nonpresent_or_rsvd_mask_len; - - return mask; -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 mask = make_mmio_spte(vcpu, gfn, access); - unsigned int gen = get_mmio_spte_generation(mask); - - access = mask & ACC_ALL; - trace_mark_mmio_spte(sptep, gfn, access, gen); + trace_mark_mmio_spte(sptep, gfn, mask); mmu_spte_set(sptep, mask); } @@ -521,7 +266,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { exception->error_code |= PFERR_RSVD_MASK; return UNMAPPED_GVA; } @@ -529,90 +274,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, return gpa; } -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) -{ - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); - -static u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - -static void kvm_mmu_reset_all_pte_masks(void) -{ - u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; - - shadow_phys_bits = kvm_get_shadow_phys_bits(); - - /* - * If the CPU has 46 or less physical address bits, then set an - * appropriate mask to guard against L1TF attacks. Otherwise, it is - * assumed that the CPU is not vulnerable to L1TF. - * - * Some Intel CPUs address the L1 cache using more PA bits than are - * reported by CPUID. Use the PA width of the L1 cache when possible - * to achieve more effective mitigation, e.g. if system RAM overlaps - * the most significant bits of legal physical address space. - */ - shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_has_bug(X86_BUG_L1TF) && - !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= - 52 - shadow_nonpresent_or_rsvd_mask_len)) { - low_phys_bits = boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len; - shadow_nonpresent_or_rsvd_mask = - rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); - } - - shadow_nonpresent_or_rsvd_lower_gfn_mask = - GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); -} - static int is_cpuid_PSE36(void) { return 1; @@ -623,35 +284,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.efer & EFER_NX; } -static int is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte); -} - -static int is_large_pte(u64 pte) -{ - return pte & PT_PAGE_SIZE_MASK; -} - -static int is_last_spte(u64 pte, int level) -{ - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; -} - -static bool is_executable_pte(u64 spte) -{ - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; -} - -static kvm_pfn_t spte_to_pfn(u64 pte) -{ - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -796,12 +428,6 @@ retry: } #endif -static bool spte_can_locklessly_be_made_writable(u64 spte) -{ - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); -} - static bool spte_has_volatile_bits(u64 spte) { if (!is_shadow_present_pte(spte)) @@ -826,21 +452,6 @@ static bool spte_has_volatile_bits(u64 spte) return false; } -static bool is_accessed_spte(u64 spte) -{ - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -976,34 +587,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -static u64 mark_spte_for_access_track(u64 spte) -{ - if (spte_ad_enabled(spte)) - return spte & ~shadow_accessed_mask; - - if (is_access_track_spte(spte)) - return spte; - - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. - */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); - - WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift), - "kvm: Access Tracking saved bit locations are not zero\n"); - - spte |= (spte & shadow_acc_track_saved_bits_mask) << - shadow_acc_track_saved_bits_shift; - spte &= ~shadow_acc_track_mask; - - return spte; -} - /* Restore an acc-track PTE back to a regular PTE */ static u64 restore_acc_track_spte(u64 spte) { @@ -1193,7 +776,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } -static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->lpage_disallowed) return; @@ -1221,7 +804,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; @@ -1640,6 +1223,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, true); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1666,6 +1252,9 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, false); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1710,6 +1299,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } + if (kvm->arch.tdp_mmu_enabled) + write_protected |= + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + return write_protected; } @@ -1769,13 +1362,8 @@ restart: pte_list_remove(rmap_head, sptep); goto restart; } else { - new_spte = *sptep & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - - new_spte = mark_spte_for_access_track(new_spte); + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + *sptep, new_pfn); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1919,12 +1507,26 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + int r; + + r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + + return r; } int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + int r; + + r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + + return r; } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1973,12 +1575,24 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + int young = false; + + young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + + return young; } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + int young = false; + + young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + + return young; } #ifdef MMU_DEBUG @@ -2577,13 +2191,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; - - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else - spte |= shadow_accessed_mask; + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); @@ -2615,8 +2223,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *spte) +/* Returns the number of zapped non-leaf child shadow pages. */ +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; @@ -2630,23 +2239,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); - } - return true; - } - if (is_mmio_spte(pte)) + /* + * Recursively zap nested TDP SPs, parentless SPs are + * unlikely to be used again in the near future. This + * avoids retaining a large number of stale nested SPs. + */ + if (tdp_enabled && invalid_list && + child->role.guest_mode && !child->parent_ptes.val) + return kvm_mmu_prepare_zap_page(kvm, child, + invalid_list); + } + } else if (is_mmio_spte(pte)) { mmu_spte_clear_no_track(spte); - - return false; + } + return 0; } -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) +static int kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list) { + int zapped = 0; unsigned i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - mmu_page_zap_pte(kvm, sp, sp->spt + i); + zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); + + return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2692,7 +2312,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); - kvm_mmu_page_unlink_children(kvm, sp); + *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ @@ -2885,8 +2505,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { struct kvm_mmu_page *sp; @@ -2946,132 +2566,42 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } -static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) -{ - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && - /* - * Some reserved pages, such as those from NVDIMM - * DAX devices, are not for MMIO, and can be mapped - * with cached memory type for better performance. - * However, the above check misconceives those pages - * as MMIO, and results in KVM mapping them with UC - * memory type, which would hurt the performance. - * Therefore, we check the host memory type in addition - * and only treat UC/UC-/WC pages as MMIO. - */ - (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - - return !e820__mapped_raw_any(pfn_to_hpa(pfn), - pfn_to_hpa(pfn + 1) - 1, - E820_TYPE_RAM); -} - -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) - static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte = 0; - int ret = 0; + u64 spte; struct kvm_mmu_page *sp; + int ret; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; sp = sptep_to_sp(sptep); - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; - - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ - spte |= shadow_present_mask; - if (!speculative) - spte |= spte_shadow_accessed_mask(spte); - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { - pte_access &= ~ACC_EXEC_MASK; - } + ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, + can_unsync, host_writable, sp_ad_disabled(sp), &spte); - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; - - if (level > PG_LEVEL_4K) - spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); - - if (host_writable) - spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; - - if (!kvm_is_mmio_pfn(pfn)) - spte |= shadow_me_mask; - - spte |= (u64)pfn << PAGE_SHIFT; - - if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. - * Same reasoning can be applied to dirty page accounting. - */ - if (!can_unsync && is_writable_pte(*sptep)) - goto set_pte; - - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); - } - } - - if (pte_access & ACC_WRITE_MASK) { + if (spte & PT_WRITABLE_MASK) kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= spte_shadow_dirty_mask(spte); - } - if (speculative) - spte = mark_spte_for_access_track(spte); - -set_pte: - if (mmu_spte_update(sptep, spte)) + if (*sptep == spte) + ret |= SET_SPTE_SPURIOUS; + else if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; return ret; } static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int write_fault, int level, + unsigned int pte_access, bool write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; int set_spte_ret; - int ret = RET_PF_RETRY; + int ret = RET_PF_FIXED; bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -3113,6 +2643,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; + /* + * The fault is fully spurious if and only if the new SPTE and old SPTE + * are identical, and emulation is not required. + */ + if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { + WARN_ON_ONCE(!was_rmapped); + return RET_PF_SPURIOUS; + } + pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) @@ -3161,7 +2700,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); put_page(pages[i]); } @@ -3239,8 +2778,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp) +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -3248,6 +2788,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; + *req_level = PG_LEVEL_4K; + if (unlikely(max_level == PG_LEVEL_4K)) return PG_LEVEL_4K; @@ -3272,7 +2814,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (level == PG_LEVEL_4K) return level; - level = min(level, max_level); + *req_level = level = min(level, max_level); + + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. + */ + if (huge_page_disallowed) + return PG_LEVEL_4K; /* * mmu_notifier_retry() was successful and mmu_lock is held, so @@ -3285,14 +2834,12 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, - gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp) { - int level = *levelp; - u64 spte = *it.sptep; + int level = *goal_levelp; - if (it.level == level && level > PG_LEVEL_4K && - is_nx_huge_page_enabled() && + if (cur_level == level && level > PG_LEVEL_4K && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -3302,26 +2849,32 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - + KVM_PAGES_PER_HPAGE(level - 1); *pfnp |= gfn & page_mask; - (*levelp)--; + (*goal_levelp)--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + bool prefault, bool is_tdp) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, ret; + int level, req_level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { @@ -3329,7 +2882,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gfn, &pfn, &level); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gfn, it.level, + &pfn, &level); base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) @@ -3341,7 +2896,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (account_disallowed_nx_lpage) + if (is_tdp && huge_page_disallowed && + req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } @@ -3349,6 +2905,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, write, level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + direct_pte_prefetch(vcpu, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -3479,21 +3038,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Return value: - * - true: let the vcpu to access on the same address again. - * - false: let the real page fault path to fix it. + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 error_code) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool fault_handled = false; + int ret = RET_PF_INVALID; u64 spte = 0ull; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) - return false; + return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3519,7 +3076,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * they are always ACC_ALL. */ if (is_access_allowed(error_code, spte)) { - fault_handled = true; + ret = RET_PF_SPURIOUS; break; } @@ -3562,11 +3119,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte, - new_spte); - if (fault_handled) + if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, + new_spte)) { + ret = RET_PF_FIXED; break; + } if (++retry_count > 4) { printk_once(KERN_WARNING @@ -3577,10 +3134,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, fault_handled); + spte, ret); walk_shadow_page_lockless_end(vcpu); - return fault_handled; + return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -3592,9 +3149,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + + if (kvm_mmu_put_root(kvm, sp)) { + if (sp->tdp_mmu_page) + kvm_tdp_mmu_free_root(kvm, sp); + else if (sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3603,6 +3164,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free) { + struct kvm *kvm = vcpu->kvm; int i; LIST_HEAD(invalid_list); bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; @@ -3620,22 +3182,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&vcpu->kvm->mmu_lock); + spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) - mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, - &invalid_list); + mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); } else { for (i = 0; i < 4; ++i) if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, + mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->root_hpa = INVALID_PAGE; @@ -3643,8 +3204,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->root_pgd = 0; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3684,8 +3245,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (vcpu->kvm->arch.tdp_mmu_enabled) { + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + + if (!VALID_PAGE(root)) + return -ENOSPC; + vcpu->arch.mmu->root_hpa = root; + } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, + true); + if (!VALID_PAGE(root)) return -ENOSPC; vcpu->arch.mmu->root_hpa = root; @@ -3910,54 +3479,82 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -/* return true if reserved bit is detected on spte. */ -static bool -walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) { struct kvm_shadow_walk_iterator iterator; - u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; - struct rsvd_bits_validate *rsvd_check; - int root, leaf; - bool reserved = false; + int leaf = vcpu->arch.mmu->root_level; + u64 spte; - rsvd_check = &vcpu->arch.mmu->shadow_zero_check; walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), - leaf = root = iterator.level; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { + leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf - 1] = spte; - leaf--; if (!is_shadow_present_pte(spte)) break; + } + + walk_shadow_page_lockless_end(vcpu); + + return leaf; +} + +/* return true if reserved bit is detected on spte. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL]; + struct rsvd_bits_validate *rsvd_check; + int root = vcpu->arch.mmu->root_level; + int leaf; + int level; + bool reserved = false; + + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { + *sptep = 0ull; + return reserved; + } + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + else + leaf = get_walk(vcpu, addr, sptes); + + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + + for (level = root; level >= leaf; level--) { + if (!is_shadow_present_pte(sptes[level - 1])) + break; /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. */ - reserved |= __is_bad_mt_xwr(rsvd_check, spte) | - __is_rsvd_bits_set(rsvd_check, spte, iterator.level); + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | + __is_rsvd_bits_set(rsvd_check, sptes[level - 1], + level); } - walk_shadow_page_lockless_end(vcpu); - if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); - while (root > leaf) { + for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[root - 1], root); - root--; - } + sptes[level - 1], level); } - *sptep = spte; + *sptep = sptes[leaf - 1]; + return reserved; } @@ -3969,7 +3566,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; - reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) return -EINVAL; @@ -4080,8 +3677,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool lpage_disallowed = exec && is_nx_huge_page_enabled(); bool map_writable; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -4092,16 +3687,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; + if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; + } r = mmu_topup_memory_caches(vcpu, false); if (r) return r; - if (lpage_disallowed) - max_level = PG_LEVEL_4K; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4118,8 +3713,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, - prefault, is_tdp && lpage_disallowed); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, + pfn, prefault); + else + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, + prefault, is_tdp); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -4292,7 +3892,13 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); + /* + * If this is a direct root page, it doesn't have a write flooding + * count. Otherwise, clear the write flooding count. + */ + if (!new_role.direct) + __clear_sp_write_flooding_count( + to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -5400,7 +5006,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u32 base_role = vcpu->arch.mmu->mmu_role.base.word; entry = *spte; - mmu_page_zap_pte(vcpu->kvm, sp, spte); + mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && !((sp->role.word ^ base_role) & ~role_ign.word) && rmap_can_add(vcpu)) @@ -5450,13 +5056,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - WARN_ON(r == RET_PF_INVALID); + if (WARN_ON_ONCE(r == RET_PF_INVALID)) + return -EIO; } - if (r == RET_PF_RETRY) - return 1; if (r < 0) return r; + if (r != RET_PF_EMULATE) + return 1; /* * Before emulating the instruction, check if the error code @@ -5485,18 +5092,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: - /* - * On AMD platforms, under certain conditions insn_len may be zero on #NPF. - * This can happen if a guest gets a page-fault on data access but the HW - * table walker is not able to read the instruction page (e.g instruction - * page is not present in memory). In those cases we simply restart the - * guest, with the exception of AMD Erratum 1096 which is unrecoverable. - */ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) - return 1; - } - return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } @@ -5682,11 +5277,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu) free_page((unsigned long)mmu->lm_root); } -static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; + mmu->root_hpa = INVALID_PAGE; + mmu->root_pgd = 0; + mmu->translate_gpa = translate_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU @@ -5712,7 +5313,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - uint i; int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; @@ -5726,25 +5326,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_pgd = 0; - vcpu->arch.root_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - - vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.guest_mmu.root_pgd = 0; - vcpu->arch.guest_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; @@ -5841,6 +5429,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_reload_remote_mmus(kvm); kvm_zap_obsolete_pages(kvm); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -5860,6 +5452,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + kvm_mmu_init_tdp_mmu(kvm); + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); @@ -5870,6 +5464,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; kvm_page_track_unregister_notifier(kvm, node); + + kvm_mmu_uninit_tdp_mmu(kvm); } void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) @@ -5877,6 +5473,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; + bool flush; spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -5896,6 +5493,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } + if (kvm->arch.tdp_mmu_enabled) { + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + if (flush) + kvm_flush_remote_tlbs(kvm); + } + spin_unlock(&kvm->mmu_lock); } @@ -5914,6 +5517,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); spin_unlock(&kvm->mmu_lock); /* @@ -5977,6 +5582,9 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, spin_lock(&kvm->mmu_lock); slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); spin_unlock(&kvm->mmu_lock); } @@ -6002,6 +5610,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); spin_unlock(&kvm->mmu_lock); /* @@ -6023,6 +5633,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6037,6 +5649,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6062,6 +5676,10 @@ restart: } kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -6357,7 +5975,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; - while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + for ( ; to_zap; --to_zap) { + if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + break; + /* * We use a separate list instead of just using active_mmu_pages * because the number of lpage_disallowed pages is expected to @@ -6367,15 +5988,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); + if (sp->tdp_mmu_page) + kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + else { + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + } - if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (to_zap) - cond_resched_lock(&kvm->mmu_lock); + cond_resched_lock(&kvm->mmu_lock); } } + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 3acf3b8eb469..bfc6389edc28 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -3,9 +3,23 @@ #define __KVM_X86_MMU_INTERNAL_H #include <linux/types.h> - +#include <linux/kvm_host.h> #include <asm/kvm_host.h> +#undef MMU_DEBUG + +#ifdef MMU_DEBUG +extern bool dbg; + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) +#define MMU_WARN_ON(x) WARN_ON(x) +#else +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) +#define MMU_WARN_ON(x) do { } while (0) +#endif + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -41,8 +55,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; + + bool tdp_mmu_page; }; +extern struct kmem_cache *mmu_page_header_cache; + static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -55,9 +73,77 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + +bool is_nx_huge_page_enabled(void); +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync); + void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages); + +static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + BUG_ON(!sp->root_count); + lockdep_assert_held(&kvm->mmu_lock); + + ++sp->root_count; +} + +static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + lockdep_assert_held(&kvm->mmu_lock); + --sp->root_count; + + return !sp->root_count; +} + +/* + * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_FIXED: The faulting entry has been fixed. + * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE, + RET_PF_INVALID, + RET_PF_FIXED, + RET_PF_SPURIOUS, +}; + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level); +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp); + +bool is_nx_huge_page_enabled(void); + +void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); + +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 9d15bc0c535b..213699b27b44 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -202,8 +202,8 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), - TP_ARGS(sptep, gfn, access, gen), + TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte), + TP_ARGS(sptep, gfn, spte), TP_STRUCT__entry( __field(void *, sptep) @@ -215,8 +215,8 @@ TRACE_EVENT( TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; - __entry->access = access; - __entry->gen = gen; + __entry->access = spte & ACC_ALL; + __entry->gen = get_mmio_spte_generation(spte); ), TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, @@ -244,14 +244,11 @@ TRACE_EVENT( __entry->access) ); -#define __spte_satisfied(__spte) \ - (__entry->retry && is_writable_pte(__entry->__spte)) - TRACE_EVENT( fast_page_fault, TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, - u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), + u64 *sptep, u64 old_spte, int ret), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -260,7 +257,7 @@ TRACE_EVENT( __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) - __field(bool, retry) + __field(int, ret) ), TP_fast_assign( @@ -270,7 +267,7 @@ TRACE_EVENT( __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; - __entry->retry = retry; + __entry->ret = ret; ), TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" @@ -278,7 +275,7 @@ TRACE_EVENT( __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, - __spte_satisfied(old_spte), __spte_satisfied(new_spte) + __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED ) ); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index a84a141a2ad2..8443a675715b 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -229,7 +229,8 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, return; idx = srcu_read_lock(&head->track_srcu); - hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, + srcu_read_lock_held(&head->track_srcu)) if (n->track_write) n->track_write(vcpu, gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); @@ -254,7 +255,8 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) return; idx = srcu_read_lock(&head->track_srcu); - hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, + srcu_read_lock_held(&head->track_srcu)) if (n->track_flush_slot) n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4dd6b1e5b8cf..50e268eb8e1a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ - mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -625,15 +625,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * emulate this operation, return 1 to indicate this case. */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, - int write_fault, int max_level, - kvm_pfn_t pfn, bool map_writable, bool prefault, - bool lpage_disallowed) + struct guest_walker *gw, u32 error_code, + int max_level, kvm_pfn_t pfn, bool map_writable, + bool prefault) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write_fault = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, hlevel, ret; + int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -679,7 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -690,10 +694,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, + &pfn, &level); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == hlevel) + if (it.level == level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -704,13 +710,16 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (huge_page_disallowed && req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, it.level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + FNAME(pte_prefetch)(vcpu, gw, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -738,7 +747,7 @@ out_gpte_changed: */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault, + struct guest_walker *walker, bool user_fault, bool *write_fault_to_shadow_pgtable) { int level; @@ -776,15 +785,13 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; + bool write_fault = error_code & PFERR_WRITE_MASK; + bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -825,7 +832,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (lpage_disallowed || is_self_change_mapping) + if (is_self_change_mapping) max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -869,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, - map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, + map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: @@ -895,6 +902,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + u64 old_spte; int level; u64 *sptep; @@ -917,7 +925,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sptep = iterator.sptep; sp = sptep_to_sp(sptep); - if (is_last_spte(*sptep, level)) { + old_spte = *sptep; + if (is_last_spte(old_spte, level)) { pt_element_t gpte; gpa_t pte_gpa; @@ -927,7 +936,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) + mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); + if (is_shadow_present_pte(old_spte)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c new file mode 100644 index 000000000000..d9c5665a55e9 --- /dev/null +++ b/arch/x86/kvm/mmu/spte.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * Macros and functions to access KVM PTEs (also known as SPTEs) + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2020 Red Hat, Inc. and/or its affiliates. + */ + + +#include <linux/kvm_host.h> +#include "mmu.h" +#include "mmu_internal.h" +#include "x86.h" +#include "spte.h" + +#include <asm/e820/api.h> + +u64 __read_mostly shadow_nx_mask; +u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +u64 __read_mostly shadow_user_mask; +u64 __read_mostly shadow_accessed_mask; +u64 __read_mostly shadow_dirty_mask; +u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_access_mask; +u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_me_mask; +u64 __read_mostly shadow_acc_track_mask; + +u64 __read_mostly shadow_nonpresent_or_rsvd_mask; +u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +u8 __read_mostly shadow_phys_bits; + +static u64 generation_mmio_spte_mask(u64 gen) +{ + u64 mask; + + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; + return mask; +} + +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) +{ + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; + u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; + + access &= shadow_mmio_access_mask; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; + + return mask; +} + +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && + /* + * Some reserved pages, such as those from NVDIMM + * DAX devices, are not for MMIO, and can be mapped + * with cached memory type for better performance. + * However, the above check misconceives those pages + * as MMIO, and results in KVM mapping them with UC + * memory type, which would hurt the performance. + * Therefore, we check the host memory type in addition + * and only treat UC/UC-/WC pages as MMIO. + */ + (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); + + return !e820__mapped_raw_any(pfn_to_hpa(pfn), + pfn_to_hpa(pfn + 1) - 1, + E820_TYPE_RAM); +} + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte) +{ + u64 spte = 0; + int ret = 0; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; + + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; + if (!speculative) + spte |= spte_shadow_accessed_mask(spte); + + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; + + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + + if (level > PG_LEVEL_4K) + spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) + spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); + + if (host_writable) + spte |= SPTE_HOST_WRITEABLE; + else + pte_access &= ~ACC_WRITE_MASK; + + if (!kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_mask; + + spte |= (u64)pfn << PAGE_SHIFT; + + if (pte_access & ACC_WRITE_MASK) { + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writable_pte(old_spte)) + goto out; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + pgprintk("%s: found shadow page for %llx, marking ro\n", + __func__, gfn); + ret |= SET_SPTE_WRITE_PROTECTED_PT; + pte_access &= ~ACC_WRITE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + } + } + + if (pte_access & ACC_WRITE_MASK) + spte |= spte_shadow_dirty_mask(spte); + + if (speculative) + spte = mark_spte_for_access_track(spte); + +out: + *new_spte = spte; + return ret; +} + +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) +{ + u64 spte; + + spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else + spte |= shadow_accessed_mask; + + return spte; +} + +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) +{ + u64 new_spte; + + new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + + new_spte = mark_spte_for_access_track(new_spte); + + return new_spte; +} + +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} + +u64 mark_spte_for_access_track(u64 spte) +{ + if (spte_ad_enabled(spte)) + return spte & ~shadow_accessed_mask; + + if (is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + + return spte; +} + +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +{ + BUG_ON((u64)(unsigned)access_mask != access_mask); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + shadow_mmio_access_mask = access_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); + +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask, u64 me_mask) +{ + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); + + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; + shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + shadow_me_mask = me_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + +void kvm_mmu_reset_all_pte_masks(void) +{ + u8 low_phys_bits; + + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; + + shadow_phys_bits = kvm_get_shadow_phys_bits(); + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. + */ + shadow_nonpresent_or_rsvd_mask = 0; + low_phys_bits = boot_cpu_data.x86_phys_bits; + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); + } + + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); +} diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h new file mode 100644 index 000000000000..4ecf40e0b8fe --- /dev/null +++ b/arch/x86/kvm/mmu/spte.h @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_X86_MMU_SPTE_H +#define KVM_X86_MMU_SPTE_H + +#include "mmu_internal.h" + +#define PT_FIRST_AVAIL_BITS_SHIFT 10 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) + +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + + +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +/* + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: + * + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 + * + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. + */ +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) + +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) + +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) + +extern u64 __read_mostly shadow_nx_mask; +extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +extern u64 __read_mostly shadow_user_mask; +extern u64 __read_mostly shadow_accessed_mask; +extern u64 __read_mostly shadow_dirty_mask; +extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_access_mask; +extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_me_mask; + +/* + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. + */ +extern u64 __read_mostly shadow_acc_track_mask; + +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +extern u8 __read_mostly shadow_phys_bits; + +static inline bool is_mmio_spte(u64 spte) +{ + return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; +} + +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + +static inline bool is_access_track_spte(u64 spte) +{ + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; +} + +static inline int is_shadow_present_pte(u64 pte) +{ + return (pte != 0) && !is_mmio_spte(pte); +} + +static inline int is_large_pte(u64 pte) +{ + return pte & PT_PAGE_SIZE_MASK; +} + +static inline int is_last_spte(u64 pte, int level) +{ + if (level == PG_LEVEL_4K) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + +static inline bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + +static inline kvm_pfn_t spte_to_pfn(u64 pte) +{ + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static inline bool is_accessed_spte(u64 spte) +{ + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); +} + +static inline bool is_dirty_spte(u64 spte) +{ + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; +} + +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); +} + +static inline u64 get_mmio_spte_generation(u64 spte) +{ + u64 gen; + + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen; +} + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte); +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); +u64 mark_spte_for_access_track(u64 spte); +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); + +void kvm_mmu_reset_all_pte_masks(void); + +#endif diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c new file mode 100644 index 000000000000..87b7e16911db --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu_internal.h" +#include "tdp_iter.h" +#include "spte.h" + +/* + * Recalculates the pointer to the SPTE for the current GFN and level and + * reread the SPTE. + */ +static void tdp_iter_refresh_sptep(struct tdp_iter *iter) +{ + iter->sptep = iter->pt_path[iter->level - 1] + + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + iter->old_spte = READ_ONCE(*iter->sptep); +} + +static gfn_t round_gfn_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + +/* + * Sets a TDP iterator to walk a pre-order traversal of the paging structure + * rooted at root_pt, starting with the walk to translate goal_gfn. + */ +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn) +{ + WARN_ON(root_level < 1); + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + + iter->goal_gfn = goal_gfn; + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; + iter->pt_path[iter->level - 1] = root_pt; + + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* + * Given an SPTE and its level, returns a pointer containing the host virtual + * address of the child page table referenced by the SPTE. Returns null if + * there is no such entry. + */ +u64 *spte_to_child_pt(u64 spte, int level) +{ + /* + * There's no child entry if this entry isn't present or is a + * last-level entry. + */ + if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) + return NULL; + + return __va(spte_to_pfn(spte) << PAGE_SHIFT); +} + +/* + * Steps down one level in the paging structure towards the goal GFN. Returns + * true if the iterator was able to step down a level, false otherwise. + */ +static bool try_step_down(struct tdp_iter *iter) +{ + u64 *child_pt; + + if (iter->level == iter->min_level) + return false; + + /* + * Reread the SPTE before stepping down to avoid traversing into page + * tables that are no longer linked from this entry. + */ + iter->old_spte = READ_ONCE(*iter->sptep); + + child_pt = spte_to_child_pt(iter->old_spte, iter->level); + if (!child_pt) + return false; + + iter->level--; + iter->pt_path[iter->level - 1] = child_pt; + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Steps to the next entry in the current page table, at the current page table + * level. The next entry could point to a page backing guest memory or another + * page table, or it could be non-present. Returns true if the iterator was + * able to step to the next entry in the page table, false if the iterator was + * already at the end of the current page table. + */ +static bool try_step_side(struct tdp_iter *iter) +{ + /* + * Check if the iterator is already at the end of the current page + * table. + */ + if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (PT64_ENT_PER_PAGE - 1)) + return false; + + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); + iter->goal_gfn = iter->gfn; + iter->sptep++; + iter->old_spte = READ_ONCE(*iter->sptep); + + return true; +} + +/* + * Tries to traverse back up a level in the paging structure so that the walk + * can continue from the next entry in the parent page table. Returns true on a + * successful step up, false if already in the root page. + */ +static bool try_step_up(struct tdp_iter *iter) +{ + if (iter->level == iter->root_level) + return false; + + iter->level++; + iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Step to the next SPTE in a pre-order traversal of the paging structure. + * To get to the next SPTE, the iterator either steps down towards the goal + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a + * highter GFN. + * + * The basic algorithm is as follows: + * 1. If the current SPTE is a non-last-level SPTE, step down into the page + * table it points to. + * 2. If the iterator cannot step down, it will try to step to the next SPTE + * in the current page of the paging structure. + * 3. If the iterator cannot step to the next entry in the current page, it will + * try to step up to the parent paging structure page. In this case, that + * SPTE will have already been visited, and so the iterator must also step + * to the side again. + */ +void tdp_iter_next(struct tdp_iter *iter) +{ + if (try_step_down(iter)) + return; + + do { + if (try_step_side(iter)) + return; + } while (try_step_up(iter)); + iter->valid = false; +} + +/* + * Restart the walk over the paging structure from the root, starting from the + * highest gfn the iterator had previously reached. Assumes that the entire + * paging structure, except the root page, may have been completely torn down + * and rebuilt. + */ +void tdp_iter_refresh_walk(struct tdp_iter *iter) +{ + gfn_t goal_gfn = iter->goal_gfn; + + if (iter->gfn > goal_gfn) + goal_gfn = iter->gfn; + + tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], + iter->root_level, iter->min_level, goal_gfn); +} + +u64 *tdp_iter_root_pt(struct tdp_iter *iter) +{ + return iter->pt_path[iter->root_level - 1]; +} + diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h new file mode 100644 index 000000000000..47170d0dc98e --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_ITER_H +#define __KVM_X86_MMU_TDP_ITER_H + +#include <linux/kvm_host.h> + +#include "mmu.h" + +/* + * A TDP iterator performs a pre-order walk over a TDP paging structure. + */ +struct tdp_iter { + /* + * The iterator will traverse the paging structure towards the mapping + * for this GFN. + */ + gfn_t goal_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ + u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ + u64 *sptep; + /* The lowest GFN mapped by the current SPTE */ + gfn_t gfn; + /* The level of the root page given to the iterator */ + int root_level; + /* The lowest level the iterator should traverse to */ + int min_level; + /* The iterator's current level within the paging structure */ + int level; + /* A snapshot of the value at sptep */ + u64 old_spte; + /* + * Whether the iterator has a valid state. This will be false if the + * iterator walks off the end of the paging structure. + */ + bool valid; +}; + +/* + * Iterates over every SPTE mapping the GFN range [start, end) in a + * preorder traversal. + */ +#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ + for (tdp_iter_start(&iter, root, root_level, min_level, start); \ + iter.valid && iter.gfn < end; \ + tdp_iter_next(&iter)) + +#define for_each_tdp_pte(iter, root, root_level, start, end) \ + for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) + +u64 *spte_to_child_pt(u64 pte, int level); + +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn); +void tdp_iter_next(struct tdp_iter *iter); +void tdp_iter_refresh_walk(struct tdp_iter *iter); +u64 *tdp_iter_root_pt(struct tdp_iter *iter); + +#endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c new file mode 100644 index 000000000000..e246d71b8ea2 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -0,0 +1,1157 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu.h" +#include "mmu_internal.h" +#include "mmutrace.h" +#include "tdp_iter.h" +#include "tdp_mmu.h" +#include "spte.h" + +#ifdef CONFIG_X86_64 +static bool __read_mostly tdp_mmu_enabled = false; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +#endif + +static bool is_tdp_mmu_enabled(void) +{ +#ifdef CONFIG_X86_64 + return tdp_enabled && READ_ONCE(tdp_mmu_enabled); +#else + return false; +#endif /* CONFIG_X86_64 */ +} + +/* Initializes the TDP MMU for the VM, if enabled. */ +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +{ + if (!is_tdp_mmu_enabled()) + return; + + /* This should not be changed for the lifetime of the VM. */ + kvm->arch.tdp_mmu_enabled = true; + + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); +} + +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) +{ + if (!kvm->arch.tdp_mmu_enabled) + return; + + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); +} + +#define for_each_tdp_mmu_root(_kvm, _root) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + sp = to_shadow_page(hpa); + + return sp->tdp_mmu_page && sp->root_count; +} + +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield); + +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + + lockdep_assert_held(&kvm->mmu_lock); + + WARN_ON(root->root_count); + WARN_ON(!root->tdp_mmu_page); + + list_del(&root->link); + + zap_gfn_range(kvm, root, 0, max_gfn, false); + + free_page((unsigned long)root->spt); + kmem_cache_free(mmu_page_header_cache, root); +} + +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, + int level) +{ + union kvm_mmu_page_role role; + + role = vcpu->arch.mmu->mmu_role.base; + role.level = level; + role.direct = true; + role.gpte_is_8_bytes = true; + role.access = ACC_ALL; + + return role; +} + +static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + sp->role.word = page_role_for_level(vcpu, level).word; + sp->gfn = gfn; + sp->tdp_mmu_page = true; + + return sp; +} + +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *root; + + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + + spin_lock(&kvm->mmu_lock); + + /* Check for an existing root before allocating a new one. */ + for_each_tdp_mmu_root(kvm, root) { + if (root->role.word == role.word) { + kvm_mmu_get_root(kvm, root); + spin_unlock(&kvm->mmu_lock); + return root; + } + } + + root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root->root_count = 1; + + list_add(&root->link, &kvm->arch.tdp_mmu_roots); + + spin_unlock(&kvm->mmu_lock); + + return root; +} + +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *root; + + root = get_tdp_mmu_vcpu_root(vcpu); + if (!root) + return INVALID_PAGE; + + return __pa(root->spt); +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level); + +static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + +static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) + return; + + if (is_accessed_spte(old_spte) && + (!is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); +} + +static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed; + struct kvm_memory_slot *slot; + + if (level > PG_LEVEL_4K) + return; + + pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if ((!is_writable_pte(old_spte) || pfn_changed) && + is_writable_pte(new_spte)) { + slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); + mark_page_dirty_in_slot(slot, gfn); + } +} + +/** + * handle_changed_spte - handle bookkeeping associated with an SPTE change + * @kvm: kvm instance + * @as_id: the address space of the paging structure the SPTE was a part of + * @gfn: the base GFN that was mapped by the SPTE + * @old_spte: The value of the SPTE before the change + * @new_spte: The value of the SPTE after the change + * @level: the level of the PT the SPTE is part of in the paging structure + * + * Handle bookkeeping that might result from the modification of a SPTE. + * This function must be called for all TDP SPTE modifications. + */ +static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + u64 *pt; + struct kvm_mmu_page *sp; + u64 old_child_spte; + int i; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); + WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level)); + + /* + * If this warning were to trigger it would indicate that there was a + * missing MMU notifier or a race with some notifier handler. + * A present, leaf SPTE should never be directly replaced with another + * present leaf SPTE pointing to a differnt PFN. A notifier handler + * should be zapping the SPTE before the main MM's page table is + * changed, or the SPTE should be zeroed, and the TLBs flushed by the + * thread before replacement. + */ + if (was_leaf && is_leaf && pfn_changed) { + pr_err("Invalid SPTE change: cannot replace a present leaf\n" + "SPTE with another present leaf SPTE mapping a\n" + "different PFN!\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + + /* + * Crash the host to prevent error propagation and guest data + * courruption. + */ + BUG(); + } + + if (old_spte == new_spte) + return; + + /* + * The only times a SPTE should be changed from a non-present to + * non-present state is when an MMIO entry is installed/modified/ + * removed. In that case, there is nothing to do here. + */ + if (!was_present && !is_present) { + /* + * If this change does not involve a MMIO SPTE, it is + * unexpected. Log the change, though it should not impact the + * guest since both the former and current SPTEs are nonpresent. + */ + if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" + "should not be replaced with another,\n" + "different nonpresent SPTE, unless one or both\n" + "are MMIO SPTEs.\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + return; + } + + + if (was_leaf && is_dirty_spte(old_spte) && + (!is_dirty_spte(new_spte) || pfn_changed)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + /* + * Recursively handle child PTs if the change removed a subtree from + * the paging structure. + */ + if (was_present && !was_leaf && (pfn_changed || !is_present)) { + pt = spte_to_child_pt(old_spte, level); + sp = sptep_to_sp(pt); + + list_del(&sp->link); + + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + old_child_spte = READ_ONCE(*(pt + i)); + WRITE_ONCE(*(pt + i), 0); + handle_changed_spte(kvm, as_id, + gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), + old_child_spte, 0, level - 1); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + free_page((unsigned long)pt); + kmem_cache_free(mmu_page_header_cache, sp); + } +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} + +static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track, + bool record_dirty_log) +{ + u64 *root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + + WRITE_ONCE(*iter->sptep, new_spte); + + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level); + if (record_acc_track) + handle_changed_spte_acc_track(iter->old_spte, new_spte, + iter->level); + if (record_dirty_log) + handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + iter->old_spte, new_spte, + iter->level); +} + +static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); +} + +static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); +} + +static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); +} + +#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ + for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + +#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _root, _start, _end) \ + if (!is_shadow_present_pte(_iter.old_spte) || \ + !is_last_spte(_iter.old_spte, _iter.level)) \ + continue; \ + else + +#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ + for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ + _mmu->shadow_root_level, _start, _end) + +/* + * Flush the TLB if the process should drop kvm->mmu_lock. + * Return whether the caller still needs to flush the tlb. + */ +static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + return false; + } else { + return true; + } +} + +static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + } +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + * If can_yield is true, will release the MMU lock and reschedule if the + * scheduler needs the CPU or there is contention on the MMU lock. If this + * function cannot yield, it will not release the MMU lock or reschedule and + * the caller must ensure it does not supply too large a GFN range, or the + * operation can cause a soft lockup. + */ +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield) +{ + struct tdp_iter iter; + bool flush_needed = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + /* + * If this is a non-last-level SPTE that covers a larger range + * than should be zapped, continue, and zap the mappings at a + * lower level. + */ + if ((iter.gfn < start || + iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + !is_last_spte(iter.old_spte, iter.level)) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + if (can_yield) + flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + else + flush_needed = true; + } + return flush_needed; +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + */ +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +{ + struct kvm_mmu_page *root; + bool flush = false; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + flush |= zap_gfn_range(kvm, root, start, end, true); + + kvm_mmu_put_root(kvm, root); + } + + return flush; +} + +void kvm_tdp_mmu_zap_all(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + bool flush; + + flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Installs a last-level SPTE to handle a TDP page fault. + * (NPT/EPT violation/misconfiguration) + */ +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + int map_writable, + struct tdp_iter *iter, + kvm_pfn_t pfn, bool prefault) +{ + u64 new_spte; + int ret = 0; + int make_spte_ret = 0; + + if (unlikely(is_noslot_pfn(pfn))) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); + } else + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); + + if (new_spte == iter->old_spte) + ret = RET_PF_SPURIOUS; + else + tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + + /* + * If the page fault was caused by a write but the page is write + * protected, emulation is needed. If the emulation was skipped, + * the vCPU would have the same fault again. + */ + if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + if (write) + ret = RET_PF_EMULATE; + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ + if (unlikely(is_mmio_spte(new_spte))) + ret = RET_PF_EMULATE; + + trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + if (!prefault) + vcpu->stat.pf_fixed++; + + return ret; +} + +/* + * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing + * page tables and SPTEs to translate the faulting guest physical address. + */ +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault) +{ + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; + struct kvm_mmu *mmu = vcpu->arch.mmu; + struct tdp_iter iter; + struct kvm_mmu_page *sp; + u64 *child_pt; + u64 new_spte; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + int level; + int req_level; + + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); + + trace_kvm_mmu_spte_requested(gpa, level, pfn); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(iter.old_spte, gfn, + iter.level, &pfn, &level); + + if (iter.level == level) + break; + + /* + * If there is an SPTE mapping a large page at a higher level + * than the target, that SPTE must be cleared and replaced + * with a non-leaf SPTE. + */ + if (is_shadow_present_pte(iter.old_spte) && + is_large_pte(iter.old_spte)) { + tdp_mmu_set_spte(vcpu->kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, + KVM_PAGES_PER_HPAGE(iter.level)); + + /* + * The iter must explicitly re-read the spte here + * because the new value informs the !present + * path below. + */ + iter.old_spte = READ_ONCE(*iter.sptep); + } + + if (!is_shadow_present_pte(iter.old_spte)) { + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + child_pt = sp->spt; + clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + + trace_kvm_mmu_get_page(sp, true); + if (huge_page_disallowed && req_level >= iter.level) + account_huge_nx_page(vcpu->kvm, sp); + + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + } + } + + if (WARN_ON(iter.level != level)) + return RET_PF_RETRY; + + ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, + pfn, prefault); + + return ret; +} + +static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end, unsigned long data, + int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long data)) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + struct kvm_mmu_page *root; + int ret = 0; + int as_id; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + as_id = kvm_mmu_page_as_id(root); + slots = __kvm_memslots(kvm, as_id); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, memslot, root, gfn_start, + gfn_end, data); + } + + kvm_mmu_put_root(kvm, root); + } + + return ret; +} + +static int zap_gfn_range_hva_wrapper(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long unused) +{ + return zap_gfn_range(kvm, root, start, end, false); +} + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + zap_gfn_range_hva_wrapper); +} + +/* + * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero + * if any of the GFNs in the range have been accessed. + */ +static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, gfn_t end, + unsigned long unused) +{ + struct tdp_iter iter; + int young = 0; + u64 new_spte = 0; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * If we have a non-accessed entry we don't need to change the + * pte. + */ + if (!is_accessed_spte(iter.old_spte)) + continue; + + new_spte = iter.old_spte; + + if (spte_ad_enabled(new_spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)&new_spte); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + + new_spte = mark_spte_for_access_track(new_spte); + } + new_spte &= ~shadow_dirty_mask; + + tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); + young = 1; + } + + return young; +} + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + age_gfn_range); +} + +static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long unused2) +{ + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) + if (is_accessed_spte(iter.old_spte)) + return 1; + + return 0; +} + +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, + test_age_gfn); +} + +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. + * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long data) +{ + struct tdp_iter iter; + pte_t *ptep = (pte_t *)data; + kvm_pfn_t new_pfn; + u64 new_spte; + int need_flush = 0; + + WARN_ON(pte_huge(*ptep)); + + new_pfn = pte_pfn(*ptep); + + tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { + if (iter.level != PG_LEVEL_4K) + continue; + + if (!is_shadow_present_pte(iter.old_spte)) + break; + + tdp_mmu_set_spte(kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); + + if (!pte_write(*ptep)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + iter.old_spte, new_pfn); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + } + + need_flush = 1; + } + + if (need_flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + + return 0; +} + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, + (unsigned long)host_ptep, + set_tdp_spte); +} + +/* + * Remove write access from all the SPTEs mapping GFNs [start, end). If + * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, int min_level) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Remove write access from all the SPTEs mapping GFNs in the memslot. Will + * only affect leaf SPTEs down to min_level. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages, min_level); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn, unsigned long mask, bool wrprot) +{ + struct tdp_iter iter; + u64 new_spte; + + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + gfn + BITS_PER_LONG) { + if (!mask) + break; + + if (iter.level > PG_LEVEL_4K || + !(mask & (1UL << (iter.gfn - gfn)))) + continue; + + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + + mask &= ~(1UL << (iter.gfn - gfn)); + } +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); + } +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + new_spte = iter.old_spte | shadow_dirty_mask; + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + + return spte_set; +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + return spte_set; +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +static void zap_collapsible_spte_range(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + kvm_pfn_t pfn; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + is_last_spte(iter.old_spte, iter.level)) + continue; + + pfn = spte_to_pfn(iter.old_spte); + if (kvm_is_reserved_pfn(pfn) || + !PageTransCompoundMap(pfn_to_page(pfn))) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + + if (spte_set) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + zap_collapsible_spte_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + if (!is_writable_pte(iter.old_spte)) + break; + + new_spte = iter.old_spte & + ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + } + + return spte_set; +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + spte_set |= write_protect_gfn(kvm, root, gfn); + } + return spte_set; +} + +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + int leaf = vcpu->arch.mmu->shadow_root_level; + gfn_t gfn = addr >> PAGE_SHIFT; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + leaf = iter.level; + sptes[leaf - 1] = iter.old_spte; + } + + return leaf; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h new file mode 100644 index 000000000000..556e065503f6 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_MMU_H +#define __KVM_X86_MMU_TDP_MMU_H + +#include <linux/kvm_host.h> + +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); + +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +void kvm_tdp_mmu_zap_all(struct kvm *kvm); + +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault); + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep); + +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level); +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + struct kvm_memory_slot *slot); +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot); +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); + +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); + +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); +#endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ac830cd50830..8c550999ace0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -153,20 +153,18 @@ int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT); + p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!p_page) goto free_avic; kvm_svm->avic_physical_id_table_page = p_page; - clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT); + l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!l_page) goto free_avic; kvm_svm->avic_logical_id_table_page = l_page; - clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: @@ -868,6 +866,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * - Tell IOMMU to use legacy mode for this interrupt. * - Retrieve ga_tag of prior interrupt remapping data. */ + pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 598a769f1961..9e4c226dbf7d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -98,6 +98,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h, *g; + unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -108,42 +109,37 @@ void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested.ctl; - svm->nested.host_intercept_exceptions = h->intercept_exceptions; - - c->intercept_cr = h->intercept_cr; - c->intercept_dr = h->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions; - c->intercept = h->intercept; + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] = h->intercepts[i]; if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want the cr8 intercept bits of L1 */ - c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); - c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); + vmcb_clr_intercept(c, INTERCEPT_CR8_READ); + vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); /* * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not * affect any interrupt we may want to inject; therefore, * interrupt window vmexits are irrelevant to L0. */ - c->intercept &= ~(1ULL << INTERCEPT_VINTR); + vmcb_clr_intercept(c, INTERCEPT_VINTR); } /* We don't want to see VMMCALLs from a nested guest */ - c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + vmcb_clr_intercept(c, INTERCEPT_VMMCALL); - c->intercept_cr |= g->intercept_cr; - c->intercept_dr |= g->intercept_dr; - c->intercept_exceptions |= g->intercept_exceptions; - c->intercept |= g->intercept; + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] |= g->intercepts[i]; } static void copy_vmcb_control_area(struct vmcb_control_area *dst, struct vmcb_control_area *from) { - dst->intercept_cr = from->intercept_cr; - dst->intercept_dr = from->intercept_dr; - dst->intercept_exceptions = from->intercept_exceptions; - dst->intercept = from->intercept; + unsigned int i; + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; @@ -176,7 +172,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -200,9 +196,23 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (!nested_svm_vmrun_msrpm(svm)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + + return true; +} + static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { - if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) return false; if (control->asid == 0) @@ -215,41 +225,39 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb) +static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) { - bool nested_vmcb_lma; - if ((vmcb->save.efer & EFER_SVME) == 0) + bool vmcb12_lma; + + if ((vmcb12->save.efer & EFER_SVME) == 0) return false; - if (((vmcb->save.cr0 & X86_CR0_CD) == 0) && - (vmcb->save.cr0 & X86_CR0_NW)) + if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW)) return false; - if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) + if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7)) return false; - nested_vmcb_lma = - (vmcb->save.efer & EFER_LME) && - (vmcb->save.cr0 & X86_CR0_PG); + vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); - if (!nested_vmcb_lma) { - if (vmcb->save.cr4 & X86_CR4_PAE) { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) + if (!vmcb12_lma) { + if (vmcb12->save.cr4 & X86_CR4_PAE) { + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) return false; } else { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) return false; } } else { - if (!(vmcb->save.cr4 & X86_CR4_PAE) || - !(vmcb->save.cr0 & X86_CR0_PE) || - (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + if (!(vmcb12->save.cr4 & X86_CR4_PAE) || + !(vmcb12->save.cr0 & X86_CR0_PE) || + (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4)) + if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb->control); + return nested_vmcb_check_controls(&vmcb12->control); } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -296,7 +304,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) * EXIT_INT_INFO. */ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, - struct vmcb *nested_vmcb) + struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 exit_int_info = 0; @@ -308,7 +316,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, if (vcpu->arch.exception.has_error_code) { exit_int_info |= SVM_EVTINJ_VALID_ERR; - nested_vmcb->control.exit_int_info_err = + vmcb12->control.exit_int_info_err = vcpu->arch.exception.error_code; } @@ -325,7 +333,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, exit_int_info |= SVM_EVTINJ_TYPE_INTR; } - nested_vmcb->control.exit_int_info = exit_int_info; + vmcb12->control.exit_int_info = exit_int_info; } static inline bool nested_npt_enabled(struct vcpu_svm *svm) @@ -364,31 +372,31 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) +static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { /* Load the nested guest state */ - svm->vmcb->save.es = nested_vmcb->save.es; - svm->vmcb->save.cs = nested_vmcb->save.cs; - svm->vmcb->save.ss = nested_vmcb->save.ss; - svm->vmcb->save.ds = nested_vmcb->save.ds; - svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; - svm->vmcb->save.idtr = nested_vmcb->save.idtr; - kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); - svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); - svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); - svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); - kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); - kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); + svm->vmcb->save.es = vmcb12->save.es; + svm->vmcb->save.cs = vmcb12->save.cs; + svm->vmcb->save.ss = vmcb12->save.ss; + svm->vmcb->save.ds = vmcb12->save.ds; + svm->vmcb->save.gdtr = vmcb12->save.gdtr; + svm->vmcb->save.idtr = vmcb12->save.idtr; + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags); + svm_set_efer(&svm->vcpu, vmcb12->save.efer); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); + svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; + kvm_rax_write(&svm->vcpu, vmcb12->save.rax); + kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); + kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = nested_vmcb->save.rax; - svm->vmcb->save.rsp = nested_vmcb->save.rsp; - svm->vmcb->save.rip = nested_vmcb->save.rip; - svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; - svm->vmcb->save.cpl = nested_vmcb->save.cpl; + svm->vmcb->save.rax = vmcb12->save.rax; + svm->vmcb->save.rsp = vmcb12->save.rsp; + svm->vmcb->save.rip = vmcb12->save.rip; + svm->vmcb->save.dr7 = vmcb12->save.dr7; + svm->vcpu.arch.dr6 = vmcb12->save.dr6; + svm->vmcb->save.cpl = vmcb12->save.cpl; } static void nested_prepare_vmcb_control(struct vcpu_svm *svm) @@ -426,17 +434,17 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb) +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, + struct vmcb *vmcb12) { int ret; - svm->nested.vmcb = vmcb_gpa; - load_nested_vmcb_control(svm, &nested_vmcb->control); - nested_prepare_vmcb_save(svm, nested_vmcb); + svm->nested.vmcb12_gpa = vmcb12_gpa; + load_nested_vmcb_control(svm, &vmcb12->control); + nested_prepare_vmcb_save(svm, vmcb12); nested_prepare_vmcb_control(svm); - ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); if (ret) return ret; @@ -449,19 +457,19 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, int nested_svm_vmrun(struct vcpu_svm *svm) { int ret; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - u64 vmcb_gpa; + u64 vmcb12_gpa; if (is_smm(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } - vmcb_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + vmcb12_gpa = svm->vmcb->save.rax; + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { kvm_inject_gp(&svm->vcpu, 0); return 1; @@ -471,26 +479,31 @@ int nested_svm_vmrun(struct vcpu_svm *svm) ret = kvm_skip_emulated_instruction(&svm->vcpu); - nested_vmcb = map.hva; + vmcb12 = map.hva; + + if (WARN_ON_ONCE(!svm->nested.initialized)) + return -EINVAL; - if (!nested_vmcb_checks(svm, nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; + if (!nested_vmcb_checks(svm, vmcb12)) { + vmcb12->control.exit_code = SVM_EXIT_ERR; + vmcb12->control.exit_code_hi = 0; + vmcb12->control.exit_info_1 = 0; + vmcb12->control.exit_info_2 = 0; goto out; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, + vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl); - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); + trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, + vmcb12->control.intercepts[INTERCEPT_CR] >> 16, + vmcb12->control.intercepts[INTERCEPT_EXCEPTION], + vmcb12->control.intercepts[INTERCEPT_WORD3], + vmcb12->control.intercepts[INTERCEPT_WORD4], + vmcb12->control.intercepts[INTERCEPT_WORD5]); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -522,7 +535,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb)) + if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) @@ -563,23 +576,23 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { int rc; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); return 1; } - nested_vmcb = map.hva; + vmcb12 = map.hva; /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); /* in case we halted in L2 */ @@ -587,45 +600,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Give the current vmcb to the guest */ - nested_vmcb->save.es = vmcb->save.es; - nested_vmcb->save.cs = vmcb->save.cs; - nested_vmcb->save.ss = vmcb->save.ss; - nested_vmcb->save.ds = vmcb->save.ds; - nested_vmcb->save.gdtr = vmcb->save.gdtr; - nested_vmcb->save.idtr = vmcb->save.idtr; - nested_vmcb->save.efer = svm->vcpu.arch.efer; - nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); - nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); - nested_vmcb->save.cr2 = vmcb->save.cr2; - nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu); - nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu); - nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu); - nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; - nested_vmcb->save.cpl = vmcb->save.cpl; - - nested_vmcb->control.int_state = vmcb->control.int_state; - nested_vmcb->control.exit_code = vmcb->control.exit_code; - nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; - nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; - nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; - - if (nested_vmcb->control.exit_code != SVM_EXIT_ERR) - nested_vmcb_save_pending_event(svm, nested_vmcb); + vmcb12->save.es = vmcb->save.es; + vmcb12->save.cs = vmcb->save.cs; + vmcb12->save.ss = vmcb->save.ss; + vmcb12->save.ds = vmcb->save.ds; + vmcb12->save.gdtr = vmcb->save.gdtr; + vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.efer = svm->vcpu.arch.efer; + vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu); + vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu); + vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr4 = svm->vcpu.arch.cr4; + vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu); + vmcb12->save.rip = kvm_rip_read(&svm->vcpu); + vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu); + vmcb12->save.rax = kvm_rax_read(&svm->vcpu); + vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr6 = svm->vcpu.arch.dr6; + vmcb12->save.cpl = vmcb->save.cpl; + + vmcb12->control.int_state = vmcb->control.int_state; + vmcb12->control.exit_code = vmcb->control.exit_code; + vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; + + if (vmcb12->control.exit_code != SVM_EXIT_ERR) + nested_vmcb_save_pending_event(svm, vmcb12); if (svm->nrips_enabled) - nested_vmcb->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb->control.next_rip; - nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl; - nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl; - nested_vmcb->control.event_inj = svm->nested.ctl.event_inj; - nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; + vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; + vmcb12->control.event_inj = svm->nested.ctl.event_inj; + vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - nested_vmcb->control.pause_filter_count = + vmcb12->control.pause_filter_count = svm->vmcb->control.pause_filter_count; - nested_vmcb->control.pause_filter_thresh = + vmcb12->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; /* Restore the original control entries */ @@ -659,11 +672,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); - trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, - nested_vmcb->control.exit_info_1, - nested_vmcb->control.exit_info_2, - nested_vmcb->control.exit_int_info, - nested_vmcb->control.exit_int_info_err, + trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, + vmcb12->control.exit_info_1, + vmcb12->control.exit_info_2, + vmcb12->control.exit_int_info, + vmcb12->control.exit_int_info_err, KVM_ISA_SVM); kvm_vcpu_unmap(&svm->vcpu, &map, true); @@ -688,6 +701,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } +int svm_allocate_nested(struct vcpu_svm *svm) +{ + struct page *hsave_page; + + if (svm->nested.initialized) + return 0; + + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!hsave_page) + return -ENOMEM; + svm->nested.hsave = page_address(hsave_page); + + svm->nested.msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->nested.msrpm) + goto err_free_hsave; + svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); + + svm->nested.initialized = true; + return 0; + +err_free_hsave: + __free_page(hsave_page); + return -ENOMEM; +} + +void svm_free_nested(struct vcpu_svm *svm) +{ + if (!svm->nested.initialized) + return; + + svm_vcpu_free_msrpm(svm->nested.msrpm); + svm->nested.msrpm = NULL; + + __free_page(virt_to_page(svm->nested.hsave)); + svm->nested.hsave = NULL; + + svm->nested.initialized = false; +} + /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ @@ -702,6 +754,8 @@ void svm_leave_nested(struct vcpu_svm *svm) copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); } + + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -709,7 +763,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -736,7 +790,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -767,14 +821,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.ctl.intercept_cr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.ctl.intercept_dr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -792,8 +844,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.ctl.intercept & exit_bits) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -833,7 +884,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm) { unsigned int nr = svm->vcpu.arch.exception.nr; - return (svm->nested.ctl.intercept_exceptions & (1 << nr)); + return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr)); } static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) @@ -901,7 +952,7 @@ static void nested_svm_intr(struct vcpu_svm *svm) static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static void nested_svm_init(struct vcpu_svm *svm) @@ -982,7 +1033,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm) case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) + if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] & + excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && svm->vcpu.arch.apf.host_apf_flags) @@ -1020,7 +1072,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, /* First fill in the header and copy it out. */ if (is_guest_mode(vcpu)) { - kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa; kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; @@ -1094,7 +1146,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { svm_leave_nested(svm); - goto out_set_gif; + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; } if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa)) @@ -1143,16 +1196,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); hsave->save = *save; - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; + svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; load_nested_vmcb_control(svm, ctl); nested_prepare_vmcb_control(svm); - if (!nested_svm_vmrun_msrpm(svm)) - return -EINVAL; - -out_set_gif: - svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); - + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: kfree(save); @@ -1163,6 +1211,7 @@ out_free: struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 3c9a45efdd4d..c0b14106258a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -447,10 +447,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) } /* - * The LAUNCH_UPDATE command will perform in-place encryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in + * place; the cache may contain the data that was written unencrypted. */ sev_clflush_pages(inpages, npages); @@ -806,10 +804,9 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) } /* - * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify + * the pages; flush the destination too so that future accesses do not + * see stale data. */ sev_clflush_pages(src_p, 1); sev_clflush_pages(dst_p, 1); @@ -857,7 +854,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) struct kvm_sev_launch_secret params; struct page **pages; void *blob, *hdr; - unsigned long n; + unsigned long n, i; int ret, offset; if (!sev_guest(kvm)) @@ -871,6 +868,12 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) return PTR_ERR(pages); /* + * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in + * place; the cache may contain the data that was written unencrypted. + */ + sev_clflush_pages(pages, n); + + /* * The secret must be copied into contiguous memory region, lets verify * that userspace memory pages are contiguous before we issue command. */ @@ -915,6 +918,11 @@ e_free_blob: e_free: kfree(data); e_unpin_memory: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < n; i++) { + set_page_dirty_lock(pages[i]); + mark_page_accessed(pages[i]); + } sev_unpin_memory(kvm, pages, n); return ret; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9709c98d0d6c..2f32fd09e259 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -91,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ -} direct_access_msrs[] = { +} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 @@ -263,9 +263,10 @@ static int get_max_npt_level(void) #endif } -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_svm *svm = to_svm(vcpu); + u64 old_efer = vcpu->arch.efer; vcpu->arch.efer = efer; if (!npt_enabled) { @@ -276,13 +277,32 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; } - if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); - svm_set_gif(svm, true); + if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { + if (!(efer & EFER_SVME)) { + svm_leave_nested(svm); + svm_set_gif(svm, true); + + /* + * Free the nested guest state, unless we are in SMM. + * In this case we will return to the nested guest + * as soon as we leave SMM. + */ + if (!is_smm(&svm->vcpu)) + svm_free_nested(svm); + + } else { + int ret = svm_allocate_nested(svm); + + if (ret) { + vcpu->arch.efer = old_efer; + return ret; + } + } } svm->vmcb->save.efer = efer | EFER_SVME; vmcb_mark_dirty(svm->vmcb, VMCB_CR); + return 0; } static int is_external_interrupt(u32 info) @@ -553,18 +573,44 @@ free_cpu_data: } -static bool valid_msr_intercept(u32 index) +static int direct_access_msr_slot(u32 msr) { - int i; + u32 i; for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == index) - return true; + if (direct_access_msrs[i].index == msr) + return i; - return false; + return -ENOENT; +} + +static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, + int write) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int slot = direct_access_msr_slot(msr); + + if (slot == -ENOENT) + return; + + /* Set the shadow bitmaps to the desired intercept states */ + if (read) + set_bit(slot, svm->shadow_msr_intercept.read); + else + clear_bit(slot, svm->shadow_msr_intercept.read); + + if (write) + set_bit(slot, svm->shadow_msr_intercept.write); + else + clear_bit(slot, svm->shadow_msr_intercept.write); } -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) +static bool valid_msr_intercept(u32 index) +{ + return direct_access_msr_slot(index) != -ENOENT; +} + +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { u8 bit_write; unsigned long tmp; @@ -583,8 +629,8 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) return !!test_bit(bit_write, &tmp); } -static void set_msr_interception(u32 *msrpm, unsigned msr, - int read, int write) +static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, + u32 msr, int read, int write) { u8 bit_read, bit_write; unsigned long tmp; @@ -596,6 +642,13 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, */ WARN_ON(!valid_msr_intercept(msr)); + /* Enforce non allowed MSRs to trap */ + if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + read = 0; + + if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + write = 0; + offset = svm_msrpm_offset(msr); bit_read = 2 * (msr & 0x0f); bit_write = 2 * (msr & 0x0f) + 1; @@ -609,17 +662,60 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write) { - int i; + set_shadow_msr_intercept(vcpu, msr, read, write); + set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); +} + +u32 *svm_vcpu_alloc_msrpm(void) +{ + struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + u32 *msrpm; + + if (!pages) + return NULL; + msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + return msrpm; +} + +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +{ + int i; + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; + set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); + } +} - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + +void svm_vcpu_free_msrpm(u32 *msrpm) +{ + __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); +} + +static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 i; + + /* + * Set intercept permissions for all direct access MSRs again. They + * will automatically get filtered through the MSR filter, so we are + * back in sync after this. + */ + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + u32 msr = direct_access_msrs[i].index; + u32 read = test_bit(i, svm->shadow_msr_intercept.read); + u32 write = test_bit(i, svm->shadow_msr_intercept.write); + + set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); } } @@ -666,26 +762,26 @@ static void init_msrpm_offsets(void) } } -static void svm_enable_lbrv(struct vcpu_svm *svm) +static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -static void svm_disable_lbrv(struct vcpu_svm *svm) +static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -813,6 +909,9 @@ static __init void svm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* Enable INVPCID feature */ + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); } static __init int svm_hardware_setup(void) @@ -985,6 +1084,21 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) return svm->vmcb->control.tsc_offset; } +static void svm_check_invpcid(struct vcpu_svm *svm) +{ + /* + * Intercept INVPCID instruction only if shadow page table is + * enabled. Interception is not required with nested page table + * enabled. + */ + if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { + if (!npt_enabled) + svm_set_intercept(svm, INTERCEPT_INVPCID); + else + svm_clr_intercept(svm, INTERCEPT_INVPCID); + } +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -992,14 +1106,14 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vcpu.arch.hflags = 0; - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR3_READ); - set_cr_intercept(svm, INTERCEPT_CR4_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); - set_cr_intercept(svm, INTERCEPT_CR3_WRITE); - set_cr_intercept(svm, INTERCEPT_CR4_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR3_READ); + svm_set_intercept(svm, INTERCEPT_CR4_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR3_WRITE); + svm_set_intercept(svm, INTERCEPT_CR4_WRITE); if (!kvm_vcpu_apicv_active(&svm->vcpu)) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1094,15 +1208,15 @@ static void init_vmcb(struct vcpu_svm *svm) control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); - clr_cr_intercept(svm, INTERCEPT_CR3_READ); - clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR3_READ); + svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; } svm->asid_generation = 0; - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; svm->vcpu.arch.hflags = 0; if (!kvm_pause_in_guest(svm->vcpu.kvm)) { @@ -1114,6 +1228,8 @@ static void init_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_PAUSE); } + svm_check_invpcid(svm); + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); @@ -1171,35 +1287,20 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; - struct page *page; - struct page *msrpm_pages; - struct page *hsave_page; - struct page *nested_msrpm_pages; + struct page *vmcb_page; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); err = -ENOMEM; - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) + vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb_page) goto out; - msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) - goto free_page1; - - nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!nested_msrpm_pages) - goto free_page2; - - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!hsave_page) - goto free_page3; - err = avic_init_vcpu(svm); if (err) - goto free_page4; + goto error_free_vmcb_page; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -1207,18 +1308,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) svm->avic_is_running = true; - svm->nested.hsave = page_address(hsave_page); - clear_page(svm->nested.hsave); - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + svm->msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->msrpm) + goto error_free_vmcb_page; - svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb = page_address(page); - clear_page(svm->vmcb); - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); + svm->vmcb = page_address(vmcb_page); + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); @@ -1227,14 +1324,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) return 0; -free_page4: - __free_page(hsave_page); -free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); -free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); -free_page1: - __free_page(page); +error_free_vmcb_page: + __free_page(vmcb_page); out: return err; } @@ -1258,10 +1349,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) */ svm_clear_current_vmcb(svm->vmcb); + svm_free_nested(svm); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->nested.hsave)); - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1549,11 +1640,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0) { - clr_cr_intercept(svm, INTERCEPT_CR0_READ); - clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); } else { - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); } } @@ -2224,12 +2315,9 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, { unsigned long cr0 = svm->vcpu.arch.cr0; bool ret = false; - u64 intercept; - - intercept = svm->nested.ctl.intercept; if (!is_guest_mode(&svm->vcpu) || - (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -2267,6 +2355,7 @@ static int cr_interception(struct vcpu_svm *svm) if (cr >= 16) { /* mov to cr */ cr -= 16; val = kvm_register_read(&svm->vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: if (!check_selective_cr0_intercepted(svm, val)) @@ -2312,6 +2401,7 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } kvm_register_write(&svm->vcpu, reg, val); + trace_kvm_cr_read(cr, val); } return kvm_complete_insn_gp(&svm->vcpu, err); } @@ -2562,7 +2652,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && @@ -2577,7 +2667,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -2641,9 +2731,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.dbgctl = data; vmcb_mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) - svm_enable_lbrv(svm); + svm_enable_lbrv(vcpu); else - svm_disable_lbrv(svm); + svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: svm->nested.hsave_msr = data; @@ -2739,6 +2829,33 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } +static int invpcid_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + unsigned long type; + gva_t gva; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* + * For an INVPCID intercept: + * EXITINFO1 provides the linear address of the memory operand. + * EXITINFO2 provides the contents of the register operand. + */ + type = svm->vmcb->control.exit_info_2; + gva = svm->vmcb->control.exit_info_1; + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_handle_invpcid(vcpu, type, gva); +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -2801,6 +2918,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, @@ -2819,12 +2937,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) } pr_err("VMCB Control Area:\n"); - pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); - pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); - pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); - pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); - pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); - pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); + pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); + pr_err("%-20s%08x %08x\n", "intercepts:", + control->intercepts[INTERCEPT_WORD3], + control->intercepts[INTERCEPT_WORD4]); pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); pr_err("%-20s%d\n", "pause filter threshold:", control->pause_filter_thresh); @@ -2923,12 +3043,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; *info1 = control->exit_info_1; *info2 = control->exit_info_2; + *intr_info = control->exit_int_info; + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->exit_int_info_err; + else + *error_code = 0; } static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) @@ -2939,7 +3066,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) + if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; @@ -2947,12 +3074,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, - svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, - svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err, - KVM_ISA_SVM); + trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); @@ -3062,13 +3184,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) if (nested_svm_virtualize_tpr(vcpu)) return; - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); if (irr == -1) return; if (tpr >= irr) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); } bool svm_nmi_blocked(struct kvm_vcpu *vcpu) @@ -3256,7 +3378,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) if (nested_svm_virtualize_tpr(vcpu)) return; - if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { + if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); } @@ -3353,8 +3475,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -3419,7 +3540,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { - fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3460,9 +3580,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -3542,8 +3660,11 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); - exit_fastpath = svm_exit_handlers_fastpath(vcpu); - return exit_fastpath; + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + return svm_exit_handlers_fastpath(vcpu); } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, @@ -3629,6 +3750,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + /* Check again if INVPCID interception if required */ + svm_check_invpcid(svm); + if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -3743,7 +3867,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, break; case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - u64 intercept; if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; @@ -3752,9 +3875,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - intercept = svm->nested.ctl.intercept; - - if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + if (!(vmcb_is_intercept(&svm->nested.ctl, + INTERCEPT_SELECTIVE_CR0))) break; cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; @@ -3889,7 +4011,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) /* FED8h - SVM Guest */ put_smstate(u64, smstate, 0x7ed8, 1); /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb); + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -3911,7 +4033,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); - u64 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); + u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); if (guest) { if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) @@ -3921,10 +4043,13 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 1; if (kvm_vcpu_map(&svm->vcpu, - gpa_to_gfn(vmcb), &map) == -EINVAL) + gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + return 1; + + if (svm_allocate_nested(svm)) return 1; - ret = enter_svm_guest_mode(svm, vmcb, map.hva); + ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva); kvm_vcpu_unmap(&svm->vcpu, &map, true); } } @@ -3945,19 +4070,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { - unsigned long cr4 = kvm_read_cr4(vcpu); - bool smep = cr4 & X86_CR4_SMEP; - bool smap = cr4 & X86_CR4_SMAP; - bool is_user = svm_get_cpl(vcpu) == 3; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + bool smep, smap, is_user; + unsigned long cr4; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -3999,6 +4115,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) * instruction pointer so we will not able to workaround it. Lets * print the error and request to kill the guest. */ + if (likely(!insn || insn_len)) + return true; + + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. + */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + cr4 = kvm_read_cr4(vcpu); + smep = cr4 & X86_CR4_SMEP; + smap = cr4 & X86_CR4_SMAP; + is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { if (!sev_guest(vcpu->kvm)) return true; @@ -4022,7 +4152,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) * if an INIT signal is pending. */ return !gif_set(svm) || - (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); + (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); } static void svm_vm_destroy(struct kvm *kvm) @@ -4160,9 +4290,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, + + .msr_filter_changed = svm_msr_filter_changed, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a798e1731709..1d853fe4c778 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -31,6 +31,7 @@ static const u32 host_save_user_msrs[] = { #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) +#define MAX_DIRECT_ACCESS_MSRS 15 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -85,8 +86,7 @@ struct svm_nested_state { struct vmcb *hsave; u64 hsave_msr; u64 vm_cr_msr; - u64 vmcb; - u32 host_intercept_exceptions; + u64 vmcb12_gpa; /* These are the merged vectors */ u32 *msrpm; @@ -97,6 +97,8 @@ struct svm_nested_state { /* cache for control fields of the guest */ struct vmcb_control_area ctl; + + bool initialized; }; struct vcpu_svm { @@ -158,6 +160,12 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; + + /* Save desired MSR intercept (read: pass-through) state */ + struct { + DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); + DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); + } shadow_msr_intercept; }; struct svm_cpu_data { @@ -214,51 +222,44 @@ static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) return svm->vmcb; } -static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) +static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr |= (1U << bit); - - recalc_intercepts(svm); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __set_bit(bit, (unsigned long *)&control->intercepts); } -static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) +static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr &= ~(1U << bit); - - recalc_intercepts(svm); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __clear_bit(bit, (unsigned long *)&control->intercepts); } -static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) +static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); - - return vmcb->control.intercept_cr & (1U << bit); + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); } static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) - | (1 << INTERCEPT_DR1_READ) - | (1 << INTERCEPT_DR2_READ) - | (1 << INTERCEPT_DR3_READ) - | (1 << INTERCEPT_DR4_READ) - | (1 << INTERCEPT_DR5_READ) - | (1 << INTERCEPT_DR6_READ) - | (1 << INTERCEPT_DR7_READ) - | (1 << INTERCEPT_DR0_WRITE) - | (1 << INTERCEPT_DR1_WRITE) - | (1 << INTERCEPT_DR2_WRITE) - | (1 << INTERCEPT_DR3_WRITE) - | (1 << INTERCEPT_DR4_WRITE) - | (1 << INTERCEPT_DR5_WRITE) - | (1 << INTERCEPT_DR6_WRITE) - | (1 << INTERCEPT_DR7_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } @@ -267,25 +268,27 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = 0; + vmcb->control.intercepts[INTERCEPT_DR] = 0; recalc_intercepts(svm); } -static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions |= (1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } -static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions &= ~(1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } @@ -294,7 +297,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept |= (1ULL << bit); + vmcb_set_intercept(&vmcb->control, bit); recalc_intercepts(svm); } @@ -303,14 +306,14 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept &= ~(1ULL << bit); + vmcb_clr_intercept(&vmcb->control, bit); recalc_intercepts(svm); } static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) { - return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; + return vmcb_is_intercept(&svm->vmcb->control, bit); } static inline bool vgif_enabled(struct vcpu_svm *svm) @@ -345,11 +348,15 @@ static inline bool gif_set(struct vcpu_svm *svm) /* svm.c */ #define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U #define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U -#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U +#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U #define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); +u32 *svm_vcpu_alloc_msrpm(void); +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); +void svm_vcpu_free_msrpm(u32 *msrpm); + +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -374,22 +381,24 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb); void svm_leave_nested(struct vcpu_svm *svm); +void svm_free_nested(struct vcpu_svm *svm); +int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b66432b015d2..aef960f90f26 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -15,18 +15,20 @@ * Tracepoint for guest mode entry. */ TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) + __field( unsigned long, rip ) ), TP_fast_assign( - __entry->vcpu_id = vcpu_id; + __entry->vcpu_id = vcpu->vcpu_id; + __entry->rip = kvm_rip_read(vcpu); ), - TP_printk("vcpu %u", __entry->vcpu_id) + TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) ); /* @@ -233,36 +235,45 @@ TRACE_EVENT(kvm_apic, (isa == KVM_ISA_VMX) ? \ __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" +#define TRACE_EVENT_KVM_EXIT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \ + TP_ARGS(exit_reason, vcpu, isa), \ + \ + TP_STRUCT__entry( \ + __field( unsigned int, exit_reason ) \ + __field( unsigned long, guest_rip ) \ + __field( u32, isa ) \ + __field( u64, info1 ) \ + __field( u64, info2 ) \ + __field( u32, intr_info ) \ + __field( u32, error_code ) \ + __field( unsigned int, vcpu_id ) \ + ), \ + \ + TP_fast_assign( \ + __entry->exit_reason = exit_reason; \ + __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->isa = isa; \ + __entry->vcpu_id = vcpu->vcpu_id; \ + kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \ + &__entry->info2, \ + &__entry->intr_info, \ + &__entry->error_code); \ + ), \ + \ + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ + "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \ + __entry->vcpu_id, \ + kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \ + __entry->guest_rip, __entry->info1, __entry->info2, \ + __entry->intr_info, __entry->error_code) \ +) + /* * Tracepoint for kvm guest exit: */ -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), - - TP_STRUCT__entry( - __field( unsigned int, exit_reason ) - __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - __entry->vcpu_id = vcpu->vcpu_id; - kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, - &__entry->info2); - ), - - TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx", - __entry->vcpu_id, - kvm_print_exit_reason(__entry->exit_reason, __entry->isa), - __entry->guest_rip, __entry->info1, __entry->info2) -); +TRACE_EVENT_KVM_EXIT(kvm_exit); /* * Tracepoint for kvm interrupt injection: @@ -544,63 +555,38 @@ TRACE_EVENT(kvm_nested_vmrun, ); TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), + TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, + __u32 intercept1, __u32 intercept2, __u32 intercept3), + TP_ARGS(cr_read, cr_write, exceptions, intercept1, + intercept2, intercept3), TP_STRUCT__entry( __field( __u16, cr_read ) __field( __u16, cr_write ) __field( __u32, exceptions ) - __field( __u64, intercept ) + __field( __u32, intercept1 ) + __field( __u32, intercept2 ) + __field( __u32, intercept3 ) ), TP_fast_assign( __entry->cr_read = cr_read; __entry->cr_write = cr_write; __entry->exceptions = exceptions; - __entry->intercept = intercept; + __entry->intercept1 = intercept1; + __entry->intercept2 = intercept2; + __entry->intercept3 = intercept3; ), - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) + TP_printk("cr_read: %04x cr_write: %04x excp: %08x " + "intercepts: %08x %08x %08x", + __entry->cr_read, __entry->cr_write, __entry->exceptions, + __entry->intercept1, __entry->intercept2, __entry->intercept3) ); /* * Tracepoint for #VMEXIT while nested */ -TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(__u64 rip, __u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - __entry->rip, - kvm_print_exit_reason(__entry->exit_code, __entry->isa), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); +TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); /* * Tracepoint for #VMEXIT reinjected to the guest diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4bbd8b448d22..3a1861403d73 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -151,7 +151,7 @@ static inline bool vmx_umip_emulated(void) static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_ENABLE_RDTSCP; } static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) @@ -196,7 +196,7 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vmx_rdrand_supported(void) +static inline bool cpu_has_vmx_rdrand(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDRAND_EXITING; @@ -233,7 +233,7 @@ static inline bool cpu_has_vmx_encls_vmexit(void) SECONDARY_EXEC_ENCLS_EXITING; } -static inline bool vmx_rdseed_supported(void) +static inline bool cpu_has_vmx_rdseed(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDSEED_EXITING; @@ -244,13 +244,13 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } -static inline bool vmx_xsaves_supported(void) +static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_XSAVES; } -static inline bool vmx_waitpkg_supported(void) +static inline bool cpu_has_vmx_waitpkg(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 19e2265956ba..89af692deb7e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -233,6 +233,44 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs = NULL; } +static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, + struct loaded_vmcs *prev) +{ + struct vmcs_host_state *dest, *src; + + if (unlikely(!vmx->guest_state_loaded)) + return; + + src = &prev->host_state; + dest = &vmx->loaded_vmcs->host_state; + + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + dest->ldt_sel = src->ldt_sel; +#ifdef CONFIG_X86_64 + dest->ds_sel = src->ds_sel; + dest->es_sel = src->es_sel; +#endif +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *prev; + int cpu; + + if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) + return; + + cpu = get_cpu(); + prev = vmx->loaded_vmcs; + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_sync_vmcs_host_state(vmx, prev); + put_cpu(); + + vmx_register_cache_reset(vcpu); +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -241,10 +279,13 @@ static void free_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; @@ -277,44 +318,6 @@ static void free_nested(struct kvm_vcpu *vcpu) free_loaded_vmcs(&vmx->nested.vmcs02); } -static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, - struct loaded_vmcs *prev) -{ - struct vmcs_host_state *dest, *src; - - if (unlikely(!vmx->guest_state_loaded)) - return; - - src = &prev->host_state; - dest = &vmx->loaded_vmcs->host_state; - - vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); - dest->ldt_sel = src->ldt_sel; -#ifdef CONFIG_X86_64 - dest->ds_sel = src->ds_sel; - dest->es_sel = src->es_sel; -#endif -} - -static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct loaded_vmcs *prev; - int cpu; - - if (vmx->loaded_vmcs == vmcs) - return; - - cpu = get_cpu(); - prev = vmx->loaded_vmcs; - vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); - vmx_sync_vmcs_host_state(vmx, prev); - put_cpu(); - - vmx_register_cache_reset(vcpu); -} - /* * Ensure that the current vmcs of the logical processor is the * vmcs01 of the vcpu before calling free_nested(). @@ -323,8 +326,6 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); vmx_leave_nested(vcpu); - vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); - free_nested(vcpu); vcpu_put(vcpu); } @@ -938,11 +939,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * VM-exit in L0, use the more accurate value. */ if (msr_index == MSR_IA32_TSC) { - int index = vmx_find_msr_index(&vmx->msr_autostore.guest, - MSR_IA32_TSC); + int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, + MSR_IA32_TSC); - if (index >= 0) { - u64 val = vmx->msr_autostore.guest.val[index].value; + if (i >= 0) { + u64 val = vmx->msr_autostore.guest.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; @@ -1031,16 +1032,16 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msrs *autostore = &vmx->msr_autostore.guest; bool in_vmcs12_store_list; - int msr_autostore_index; + int msr_autostore_slot; bool in_autostore_list; int last; - msr_autostore_index = vmx_find_msr_index(autostore, msr_index); - in_autostore_list = msr_autostore_index >= 0; + msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); + in_autostore_list = msr_autostore_slot >= 0; in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); if (in_vmcs12_store_list && !in_autostore_list) { - if (autostore->nr == NR_LOADSTORE_MSRS) { + if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by @@ -1057,7 +1058,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, autostore->val[last].index = msr_index; } else if (!in_vmcs12_store_list && in_autostore_list) { last = --autostore->nr; - autostore->val[msr_autostore_index] = autostore->val[last]; + autostore->val[msr_autostore_slot] = autostore->val[last]; } } @@ -2286,7 +2287,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -2314,6 +2315,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + secondary_exec_controls_set(vmx, exec_control); } @@ -2408,6 +2412,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmx->segment_cache.bitmask = 0; } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & @@ -2571,7 +2577,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. */ - if (vmx->emulation_required) { + if (CC(!vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -3344,8 +3350,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - if (unlikely(!nested_get_vmcs12_pages(vcpu))) + if (unlikely(!nested_get_vmcs12_pages(vcpu))) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_KVM_INTERNAL_ERROR; + } if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); @@ -3387,7 +3395,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * to nested_get_vmcs12_pages before the next VM-entry. The MSRs * have already been set at vmentry time and should not be reset. */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } /* @@ -3468,11 +3476,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; - } else if (evmptrld_status == EVMPTRLD_VMFAIL) { + } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { return nested_vmx_failInvalid(vcpu); } - if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) + if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -3483,7 +3491,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * rather than RFLAGS.ZF, and no error number is stored to the * VM-instruction error field. */ - if (vmcs12->hdr.shadow_vmcs) + if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); if (vmx->nested.hv_evmcs) { @@ -3504,10 +3512,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ - if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) + if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); - if (vmcs12->launch_state == launch) + if (CC(vmcs12->launch_state == launch)) return nested_vmx_fail(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); @@ -3528,6 +3536,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; + /* Emulate processing of posted interrupts on VM-Enter. */ + if (nested_cpu_has_posted_intr(vmcs12) && + kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); + } + /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -4257,7 +4273,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) { - struct shared_msr_entry *efer_msr; + struct vmx_uret_msr *efer_msr; unsigned int i; if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) @@ -4271,7 +4287,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmx->msr_autoload.guest.val[i].value; } - efer_msr = find_msr_entry(vmx, MSR_EFER); + efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); if (efer_msr) return efer_msr->data; @@ -4696,7 +4712,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); if (r != X86EMUL_CONTINUE) { - *ret = vmx_handle_memory_failure(vcpu, r, &e); + *ret = kvm_handle_memory_failure(vcpu, r, &e); return -EINVAL; } @@ -4760,7 +4776,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); } return 0; @@ -5003,7 +5019,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* _system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } return nested_vmx_succeed(vcpu); @@ -5076,7 +5092,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); @@ -5238,7 +5254,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, sizeof(gpa_t), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); return nested_vmx_succeed(vcpu); } @@ -5291,7 +5307,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); /* * Nested EPT roots are always held through guest_mmu, @@ -5373,7 +5389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) return nested_vmx_fail(vcpu, @@ -5918,13 +5934,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - exit_intr_info = vmx_get_intr_info(vcpu); - exit_qual = vmx_get_exit_qual(vcpu); - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, - vmx->idt_vectoring_info, exit_intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -5940,14 +5950,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue. */ - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + exit_intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } + exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); @@ -6182,7 +6192,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } else { return -EINVAL; } @@ -6318,7 +6328,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | @@ -6337,7 +6348,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); @@ -6391,7 +6403,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -6561,7 +6573,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, - .get_vmcs12_pages = nested_get_vmcs12_pages, + .get_nested_state_pages = nested_get_vmcs12_pages, .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c new file mode 100644 index 000000000000..e4e7adff818c --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm_host.h> + +#include <asm/irq_remapping.h> +#include <asm/cpu.h> + +#include "lapic.h" +#include "posted_intr.h" +#include "trace.h" +#include "vmx.h" + +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + +static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + + /* The full case. */ + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + new.sn = 0; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + +after_clear_sn: + + /* + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. + */ + smp_mb__after_atomic(); + + if (!pi_is_pir_empty(pi_desc)) + pi_set_on(pi_desc); +} + +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); +} + +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +int pi_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return 0; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } + + do { + old.control = new.control = pi_desc->control; + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); +} + +void pi_post_block(struct kvm_vcpu *vcpu) +{ + if (vcpu->pre_pcpu == -1) + return; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +void pi_wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +void __init pi_init(int cpu) +{ + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); +} + + +/* + * pi_update_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = 0; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(kvm->vcpus[0])) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. + */ + + kvm_set_msi_irq(kvm, e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. + */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + + continue; + } + + vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else + ret = irq_set_vcpu_affinity(host_irq, NULL); + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h new file mode 100644 index 000000000000..e53b97f82097 --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_POSTED_INTR_H +#define __KVM_X86_VMX_POSTED_INTR_H + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); +int pi_pre_block(struct kvm_vcpu *vcpu); +void pi_post_block(struct kvm_vcpu *vcpu); +void pi_wakeup_handler(void); +void __init pi_init(int cpu); +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set); + +#endif /* __KVM_X86_VMX_POSTED_INTR_H */
\ No newline at end of file diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7a3675fddec2..1472c6c376f7 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -138,6 +138,13 @@ static inline bool is_external_intr(u32 intr_info) return is_intr_type(intr_info, INTR_TYPE_EXT_INTR); } +static inline bool is_exception_with_error_code(u32 intr_info) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK; + + return (intr_info & mask) == mask; +} + enum vmcs_field_width { VMCS_FIELD_WIDTH_U16 = 0, VMCS_FIELD_WIDTH_U64 = 1, diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 799db084a336..90ad7a6246e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -4,6 +4,7 @@ #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> +#include <asm/segment.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -294,3 +295,36 @@ SYM_FUNC_START(vmread_error_trampoline) ret SYM_FUNC_END(vmread_error_trampoline) + +SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + CALL_NOSPEC _ASM_ARG1 + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + ret +SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f0a9954c49db..ab6d2d1525ec 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -56,7 +56,6 @@ #include "lapic.h" #include "mmu.h" #include "nested.h" -#include "ops.h" #include "pmu.h" #include "trace.h" #include "vmcs.h" @@ -149,8 +148,25 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ RTIT_STATUS_BYTECNT)) -#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ - (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) +/* + * List of MSRs that can be directly passed to the guest. + * In addition to these x2apic and PT MSRs are handled specially. + */ +static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { + MSR_IA32_SPEC_CTRL, + MSR_IA32_PRED_CMD, + MSR_IA32_TSC, + MSR_FS_BASE, + MSR_GS_BASE, + MSR_KERNEL_GS_BASE, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_ESP, + MSR_IA32_SYSENTER_EIP, + MSR_CORE_C1_RES, + MSR_CORE_C3_RESIDENCY, + MSR_CORE_C6_RESIDENCY, + MSR_CORE_C7_RESIDENCY, +}; /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: @@ -344,9 +360,8 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); -static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_vmexit(void); @@ -401,13 +416,6 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs); */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -/* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. - */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); - static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -450,9 +458,9 @@ static unsigned long host_idt_base; * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To * support this emulation, IA32_STAR must always be included in - * vmx_msr_index[], even in i386 builds. + * vmx_uret_msrs_list[], even in i386 builds. */ -const u32 vmx_msr_index[] = { +static const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif @@ -626,36 +634,71 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } -static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +static int possible_passthrough_msr_slot(u32 msr) +{ + u32 i; + + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) + if (vmx_possible_passthrough_msrs[i] == msr) + return i; + + return -ENOENT; +} + +static bool is_valid_passthrough_msr(u32 msr) +{ + bool r; + + switch (msr) { + case 0x800 ... 0x8ff: + /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ + return true; + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ + return true; + } + + r = possible_passthrough_msr_slot(msr) != -ENOENT; + + WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + + return r; +} + +static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + for (i = 0; i < vmx->nr_uret_msrs; ++i) + if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr) return i; return -1; } -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __find_msr_index(vmx, msr); + i = __vmx_find_uret_msr(vmx, msr); if (i >= 0) - return &vmx->guest_msrs[i]; + return &vmx->guest_uret_msrs[i]; return NULL; } -static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data) +static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, + struct vmx_uret_msr *msr, u64 data) { int ret = 0; u64 old_msr_data = msr->data; msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); preempt_enable(); if (ret) msr->data = old_msr_data; @@ -840,7 +883,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr) +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -874,7 +917,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; @@ -882,7 +925,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: - i = vmx_find_msr_index(&m->host, msr); + i = vmx_find_loadstore_msr_slot(&m->host, msr); if (i < 0) return; @@ -941,12 +984,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only) - j = vmx_find_msr_index(&m->host, msr); + j = vmx_find_loadstore_msr_slot(&m->host, msr); - if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) || - (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) { + if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -969,10 +1012,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->host.val[j].value = host_val; } -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; + int i; /* Shadow paging assumes NX to be available. */ if (!enable_ept) @@ -1004,17 +1048,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) else clear_atomic_switch_msr(vmx, MSR_EFER); return false; - } else { - clear_atomic_switch_msr(vmx, MSR_EFER); + } + + i = __vmx_find_uret_msr(vmx, MSR_EFER); + if (i < 0) + return false; - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; + clear_atomic_switch_msr(vmx, MSR_EFER); - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; - return true; - } + vmx->guest_uret_msrs[i].data = guest_efer; + vmx->guest_uret_msrs[i].mask = ~ignore_bits; + + return true; } #ifdef CONFIG_X86_32 @@ -1052,6 +1100,12 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } +static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) +{ + /* The base must be 128-byte aligned and a legal physical address. */ + return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); +} + static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; @@ -1156,12 +1210,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ - if (!vmx->guest_msrs_ready) { - vmx->guest_msrs_ready = true; - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); + if (!vmx->guest_uret_msrs_loaded) { + vmx->guest_uret_msrs_loaded = true; + for (i = 0; i < vmx->nr_active_uret_msrs; ++i) + kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, + vmx->guest_uret_msrs[i].data, + vmx->guest_uret_msrs[i].mask); } @@ -1245,7 +1299,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->guest_state_loaded = false; - vmx->guest_msrs_ready = false; + vmx->guest_uret_msrs_loaded = false; } #ifdef CONFIG_X86_64 @@ -1268,62 +1322,6 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. - */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) - return; - - /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; - } - - /* The full case. */ - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - -after_clear_sn: - - /* - * Clear SN before reading the bitmap. The VT-d firmware - * writes the bitmap and reads SN atomically (5.2.3 in the - * spec), so it doesn't really have a memory barrier that - * pairs with this, but we cannot do that and we need one. - */ - smp_mb__after_atomic(); - - if (!pi_is_pir_empty(pi_desc)) - pi_set_on(pi_desc); -} - void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy) { @@ -1407,20 +1405,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1430,7 +1414,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static bool emulation_required(struct kvm_vcpu *vcpu) { - return emulate_invalid_guest_state && !guest_state_valid(vcpu); + return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -1456,7 +1440,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; - if (enable_unrestricted_guest) { + if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; vmcs_writel(GUEST_RFLAGS, rflags); @@ -1576,6 +1560,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +{ + return true; +} + static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip, orig_rip; @@ -1614,33 +1603,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) } /* - * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns - * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value - * indicates whether exit to userspace is needed. - */ -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e) -{ - if (r == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_emulated_page_fault(vcpu, e); - return 1; - } - - /* - * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED - * while handling a VMX instruction KVM could've handled the request - * correctly by exiting to userspace and performing I/O but there - * doesn't seem to be a real use-case behind such requests, just return - * KVM_EXIT_INTERNAL_ERROR for now. - */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - - return 0; -} - -/* * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ @@ -1723,16 +1685,19 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) { - struct shared_msr_entry tmp; + struct vmx_uret_msr tmp; + int from, to; - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; + from = __vmx_find_uret_msr(vmx, msr); + if (from < 0) + return; + to = vmx->nr_active_uret_msrs++; + + tmp = vmx->guest_uret_msrs[to]; + vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from]; + vmx->guest_uret_msrs[from] = tmp; } /* @@ -1742,38 +1707,26 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs, index; - - save_nmsrs = 0; + vmx->guest_uret_msrs_loaded = false; + vmx->nr_active_uret_msrs = 0; #ifdef CONFIG_X86_64 /* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set. */ if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { - index = __find_msr_index(vmx, MSR_STAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); + vmx_setup_uret_msr(vmx, MSR_STAR); + vmx_setup_uret_msr(vmx, MSR_LSTAR); + vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); } #endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; - vmx->guest_msrs_ready = false; + if (update_transition_efer(vmx)) + vmx_setup_uret_msr(vmx, MSR_EFER); + + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + vmx_setup_uret_msr(vmx, MSR_TSC_AUX); + + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); @@ -1843,7 +1796,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; u32 index; switch (msr_info->index) { @@ -1864,7 +1817,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; @@ -1971,10 +1924,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: - msr = find_msr_entry(vmx, msr_info->index); + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; @@ -2003,7 +1956,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; @@ -2097,7 +2050,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; @@ -2107,7 +2060,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2133,8 +2086,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) @@ -2184,7 +2136,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if (!pt_can_write_msr(vmx)) @@ -2209,7 +2161,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; - if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK) + if (!pt_output_base_valid(vcpu, data)) return 1; vmx->pt_desc.guest.output_base = data; break; @@ -2244,13 +2196,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: - msr = find_msr_entry(vmx, msr_index); + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_index); if (msr) - ret = vmx_set_guest_msr(vmx, msr, data); + ret = vmx_set_guest_uret_msr(vmx, msr, data); else ret = kvm_set_msr_common(vcpu, msr_info); } @@ -2282,7 +2234,8 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + if (is_unrestricted_guest(vcpu) || + (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2463,7 +2416,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -2877,13 +2830,14 @@ static void enter_rmode(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); + /* Nothing to do if hardware doesn't support EFER. */ if (!msr) - return; + return 0; vcpu->arch.efer = efer; if (efer & EFER_LMA) { @@ -2895,6 +2849,7 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } setup_msrs(vmx); + return 0; } #ifdef CONFIG_X86_64 @@ -3048,7 +3003,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) unsigned long hw_cr0; hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; @@ -3069,7 +3024,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept && !enable_unrestricted_guest) + if (enable_ept && !is_unrestricted_guest(vcpu)) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -3149,7 +3104,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long hw_cr4; hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; @@ -3184,7 +3139,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); - if (!enable_unrestricted_guest) { + if (!is_unrestricted_guest(vcpu)) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; @@ -3324,7 +3279,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack. */ - if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); @@ -3513,11 +3468,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) * not. * We assume that registers are always usable */ -static bool guest_state_valid(struct kvm_vcpu *vcpu) +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) { - if (enable_unrestricted_guest) - return true; - /* real mode guest state checks */ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) @@ -3703,11 +3655,52 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) +static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + +static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) { int f = sizeof(unsigned long); + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + if (!cpu_has_vmx_msr_bitmap()) return; @@ -3715,36 +3708,44 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __clear_bit(msr, msr_bitmap + 0x000 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filters change. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + clear_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + clear_bit(idx, vmx->shadow_msr_intercept.write); + } + } - if (type & MSR_TYPE_W) - /* write-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); + if ((type & MSR_TYPE_R) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { + vmx_set_msr_bitmap_read(msr_bitmap, msr); + type &= ~MSR_TYPE_R; + } - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __clear_bit(msr, msr_bitmap + 0x400 / f); + if ((type & MSR_TYPE_W) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { + vmx_set_msr_bitmap_write(msr_bitmap, msr); + type &= ~MSR_TYPE_W; + } - if (type & MSR_TYPE_W) - /* write-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); + if (type & MSR_TYPE_R) + vmx_clear_msr_bitmap_read(msr_bitmap, msr); - } + if (type & MSR_TYPE_W) + vmx_clear_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { - int f = sizeof(unsigned long); + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; if (!cpu_has_vmx_msr_bitmap()) return; @@ -3753,39 +3754,34 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filter changes. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + set_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + set_bit(idx, vmx->shadow_msr_intercept.write); + } + } - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); + if (type & MSR_TYPE_R) + vmx_set_msr_bitmap_read(msr_bitmap, msr); - } + if (type & MSR_TYPE_W) + vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) +static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value) { if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + vmx_enable_intercept_for_msr(vcpu, msr, type); else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); + vmx_disable_intercept_for_msr(vcpu, msr, type); } static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) @@ -3803,35 +3799,47 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) +static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { + unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + unsigned long read_intercept; int msr; + read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + unsigned int read_idx = msr / BITS_PER_LONG; + unsigned int write_idx = read_idx + (0x800 / sizeof(long)); + + msr_bitmap[read_idx] = read_intercept; + msr_bitmap[write_idx] = ~0ul; } +} - if (mode & MSR_BITMAP_MODE_X2APIC) { - /* - * TPR reads and writes can be virtualized even if virtual interrupt - * delivery is not in use. - */ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); - if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); - } +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +{ + if (!cpu_has_vmx_msr_bitmap()) + return; + + vmx_reset_x2apic_msrs(vcpu, mode); + + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. + */ + vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, + !(mode & MSR_BITMAP_MODE_X2APIC)); + + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; u8 mode = vmx_msr_bitmap_mode(vcpu); u8 changed = mode ^ vmx->msr_bitmap_mode; @@ -3839,30 +3847,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) return; if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + vmx_update_msr_bitmap_x2apic(vcpu, mode); vmx->msr_bitmap_mode = mode; } -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, - MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.addr_range; i++) { - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } @@ -3886,6 +3888,29 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) return ((rvi & 0xf0) > (vppr & 0xf0)); } +static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 i; + + /* + * Set intercept permissions for all potentially passed through MSRs + * again. They will automatically get filtered through the MSR filter, + * so we are back in sync after this. + */ + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { + u32 msr = vmx_possible_passthrough_msrs[i]; + bool read = test_bit(i, vmx->shadow_msr_intercept.read); + bool write = test_bit(i, vmx->shadow_msr_intercept.write); + + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + } + + pt_update_intercept_for_msr(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -4043,13 +4068,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS; + struct kvm_vcpu *vcpu = &vmx->vcpu; + + vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & + ~vcpu->arch.cr4_guest_rsvd_bits; if (!enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) - vmx->vcpu.arch.cr4_guest_owned_bits &= - ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + vcpu->arch.cr4_guest_owned_bits &= + ~get_vmcs12(vcpu)->cr4_guest_host_mask; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) @@ -4114,6 +4142,61 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +/* + * Adjust a single secondary execution control bit to intercept/allow an + * instruction in the guest. This is usually done based on whether or not a + * feature has been exposed to the guest in order to correctly emulate faults. + */ +static inline void +vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, + u32 control, bool enabled, bool exiting) +{ + /* + * If the control is for an opt-in feature, clear the control if the + * feature is not exposed to the guest, i.e. not enabled. If the + * control is opt-out, i.e. an exiting control, clear the control if + * the feature _is_ exposed to the guest, i.e. exiting/interception is + * disabled for the associated instruction. Note, the caller is + * responsible presetting exec_control to set all supported bits. + */ + if (enabled == exiting) + *exec_control &= ~control; + + /* + * Update the nested MSR settings so that a nested VMM can/can't set + * controls for features that are/aren't exposed to the guest. + */ + if (nested) { + if (enabled) + vmx->nested.msrs.secondary_ctls_high |= control; + else + vmx->nested.msrs.secondary_ctls_high &= ~control; + } +} + +/* + * Wrapper macro for the common case of adjusting a secondary execution control + * based on a single guest CPUID bit, with a dedicated feature bit. This also + * verifies that the control is actually supported by KVM and hardware. + */ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + __enabled = guest_cpuid_has(&(vmx)->vcpu, \ + X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, \ + SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ + } \ +}) + +/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ +#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) + +#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { @@ -4154,7 +4237,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (vmx_xsaves_supported()) { + if (cpu_has_vmx_xsaves()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4163,101 +4246,29 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vcpu->arch.xsaves_enabled = xsaves_enabled; - if (!xsaves_enabled) - exec_control &= ~SECONDARY_EXEC_XSAVES; - - if (nested) { - if (xsaves_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_XSAVES; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_XSAVES; - } - } - - if (cpu_has_vmx_rdtscp()) { - bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); - if (!rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; - - if (nested) { - if (rdtscp_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDTSCP; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; - } - } - - if (cpu_has_vmx_invpcid()) { - /* Exposing INVPCID only when PCID is exposed */ - bool invpcid_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && - guest_cpuid_has(vcpu, X86_FEATURE_PCID); - - if (!invpcid_enabled) { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); - } - - if (nested) { - if (invpcid_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_INVPCID; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_INVPCID; - } - } - - if (vmx_rdrand_supported()) { - bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); - if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; - - if (nested) { - if (rdrand_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDRAND_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND_EXITING; - } + vmx_adjust_secondary_exec_control(vmx, &exec_control, + SECONDARY_EXEC_XSAVES, + xsaves_enabled, false); } - if (vmx_rdseed_supported()) { - bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); - if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; + vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); - if (nested) { - if (rdseed_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDSEED_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED_EXITING; - } - } + /* + * Expose INVPCID if and only if PCID is also exposed to the guest. + * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF + * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect + * behavior from the guest perspective (it would expect #GP or #PF). + */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) + guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); - if (vmx_waitpkg_supported()) { - bool waitpkg_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); - if (!waitpkg_enabled) - exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); - if (nested) { - if (waitpkg_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - } - } + vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, + ENABLE_USR_WAIT_PAUSE, false); vmx->secondary_exec_control = exec_control; } @@ -4350,7 +4361,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - if (vmx_xsaves_supported()) + if (cpu_has_vmx_xsaves()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { @@ -5154,7 +5165,8 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(vcpu, 0); + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_skip_emulated_instruction(vcpu); } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -5337,7 +5349,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. */ - if (unlikely(allow_smaller_maxphyaddr && kvm_mmu_is_illegal_gpa(vcpu, gpa))) + if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -5448,25 +5460,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. - */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - static void vmx_enable_tdp(void) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -5530,16 +5523,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; unsigned long type; - bool pcid_enabled; gva_t gva; - struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; } operand; - int r; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5562,68 +5550,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) sizeof(operand), &gva)) return 1; - r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); - if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); - - if (operand.pcid >> 12 != 0) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - switch (type) { - case INVPCID_TYPE_INDIV_ADDR: - if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_SINGLE_CTXT: - if (!pcid_enabled && (operand.pcid != 0)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); - /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. - */ - - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_ALL_NON_GLOBAL: - /* - * Currently, KVM doesn't mark global entries in the shadow - * page tables, so a non-global flush just degenerates to a - * global flush. If needed, we could optimize this later by - * keeping track of global entries in shadow page tables. - */ - - fallthrough; - case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); - return kvm_skip_emulated_instruction(vcpu); - - default: - BUG(); /* We have already checked above that type <= 3 */ - } + return kvm_handle_invpcid(vcpu, type, gva); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -5752,10 +5679,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + *info1 = vmx_get_exit_qual(vcpu); - *info2 = vmx_get_intr_info(vcpu); + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + *info2 = vmx->idt_vectoring_info; + *intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + else + *error_code = 0; + } else { + *info2 = 0; + *intr_info = 0; + *error_code = 0; + } } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -6389,14 +6330,6 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } -static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - return pi_test_on(pi_desc) || - (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); -} - static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -6416,70 +6349,43 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } +void vmx_do_interrupt_nmi_irqoff(unsigned long entry); + +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +{ + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; + gate_desc *desc = (gate_desc *)host_idt_base + vector; + + kvm_before_interrupt(vcpu); + vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(intr_info)) { + if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(intr_info)) { + else if (is_machine_check(intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(intr_info)) { - kvm_before_interrupt(&vmx->vcpu); - asm("int $2"); - kvm_after_interrupt(&vmx->vcpu); - } + else if (is_nmi(intr_info)) + handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { - unsigned int vector; - unsigned long entry; -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif - gate_desc *desc; u32 intr_info = vmx_get_intr_info(vcpu); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - vector = intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)host_idt_base + vector; - entry = gate_offset(desc); - - kvm_before_interrupt(vcpu); - - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%rsp, %[sp]\n\t" - "and $-16, %%rsp\n\t" - "push %[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - "push %[cs]\n\t" - CALL_NOSPEC - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), -#endif - ASM_CALL_CONSTRAINT - : - [thunk_target]"r"(entry), -#ifdef CONFIG_X86_64 - [ss]"i"(__KERNEL_DS), -#endif - [cs]"i"(__KERNEL_CS) - ); - - kvm_after_interrupt(vcpu); + handle_interrupt_nmi_irqoff(vcpu, intr_info); } -STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { @@ -6806,9 +6712,7 @@ reenter_guest: if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -6952,20 +6856,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); + BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { + u32 index = vmx_uret_msrs_list[i]; u32 data_low, data_high; - int j = vmx->nmsrs; + int j = vmx->nr_uret_msrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; + vmx->guest_uret_msrs[j].slot = i; + vmx->guest_uret_msrs[j].data = 0; switch (index) { case MSR_IA32_TSX_CTRL: /* @@ -6973,32 +6877,36 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) * let's avoid changing CPUID bits under the host * kernel's feet. */ - vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; break; default: - vmx->guest_msrs[j].mask = -1ull; + vmx->guest_uret_msrs[j].mask = -1ull; break; } - ++vmx->nmsrs; + ++vmx->nr_uret_msrs; } err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_pml; + /* The MSR bitmap starts with all ones */ + bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(vcpu->kvm)) { - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } vmx->msr_bitmap_mode = 0; @@ -7022,8 +6930,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } if (nested) - nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, - vmx_capability.ept); + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); @@ -7343,13 +7250,18 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { - struct shared_msr_entry *msr; - msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); + struct vmx_uret_msr *msr; + msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); - vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } + + set_cr4_guest_host_mask(vmx); + + /* Refresh #PF interception to account for MAXPHYADDR changes. */ + update_exception_bitmap(vcpu); } static __init void vmx_set_cpu_caps(void) @@ -7373,14 +7285,14 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0xD.1 */ supported_xss = 0; - if (!vmx_xsaves_supported()) + if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); - if (vmx_waitpkg_supported()) + if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } @@ -7436,7 +7348,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, * Because it is marked as EmulateOnUD, we need to intercept it here. */ case x86_intercept_rdtscp: - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; @@ -7568,107 +7480,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - do { - old.control = new.control = pi_desc->control; - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } -} - -/* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * - */ -static int pi_pre_block(struct kvm_vcpu *vcpu) -{ - unsigned int dest; - struct pi_desc old, new; - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } - - do { - old.control = new.control = pi_desc->control; - - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'wakeup vector' */ - new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc) == 1) - __pi_post_block(vcpu); - - local_irq_enable(); - return (vcpu->pre_pcpu == -1); -} - static int vmx_pre_block(struct kvm_vcpu *vcpu) { if (pi_pre_block(vcpu)) @@ -7680,17 +7491,6 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void pi_post_block(struct kvm_vcpu *vcpu) -{ - if (vcpu->pre_pcpu == -1) - return; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - __pi_post_block(vcpu); - local_irq_enable(); -} - static void vmx_post_block(struct kvm_vcpu *vcpu) { if (kvm_x86_ops.set_hv_timer) @@ -7699,100 +7499,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) pi_post_block(vcpu); } -/* - * vmx_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - * - * In addition, we can only inject generic interrupts using - * the PI mechanism, refuse to route others through it. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - - continue; - } - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); - - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7850,11 +7556,6 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) /* RSM will cause a vmexit anyway. */ } -static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) -{ - return false; -} - static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon; @@ -7961,7 +7662,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, @@ -7995,7 +7696,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = vmx_update_pi_irte, + .update_pi_irte = pi_update_irte, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, @@ -8009,9 +7710,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vmx_msr_filter_changed, }; static __init int hardware_setup(void) @@ -8023,8 +7726,8 @@ static __init int hardware_setup(void) store_idt(&dt); host_idt_base = dt.address; - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) + kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]); if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; @@ -8161,7 +7864,7 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -8300,8 +8003,8 @@ static int __init vmx_init(void) for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + + pi_init(cpu); } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a0e47720f60c..f6f66e5c6510 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -9,8 +9,9 @@ #include "capabilities.h" #include "kvm_cache_regs.h" -#include "ops.h" +#include "posted_intr.h" #include "vmcs.h" +#include "vmx_ops.h" #include "cpuid.h" extern const u32 vmx_msr_index[]; @@ -22,20 +23,20 @@ extern const u32 vmx_msr_index[]; #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 -#define NR_SHARED_MSRS 7 +#define MAX_NR_USER_RETURN_MSRS 7 #else -#define NR_SHARED_MSRS 4 +#define MAX_NR_USER_RETURN_MSRS 4 #endif -#define NR_LOADSTORE_MSRS 8 +#define MAX_NR_LOADSTORE_MSRS 8 struct vmx_msrs { unsigned int nr; - struct vmx_msr_entry val[NR_LOADSTORE_MSRS]; + struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS]; }; -struct shared_msr_entry { - unsigned index; +struct vmx_uret_msr { + unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */ u64 data; u64 mask; }; @@ -49,29 +50,6 @@ enum segment_cache_field { SEG_FIELD_NR = 4 }; -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - #define RTIT_ADDR_RANGE 4 struct pt_ctx { @@ -218,10 +196,10 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; - struct shared_msr_entry guest_msrs[NR_SHARED_MSRS]; - int nmsrs; - int save_nmsrs; - bool guest_msrs_ready; + struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; + int nr_uret_msrs; + int nr_active_uret_msrs; + bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -301,6 +279,13 @@ struct vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; + + /* Save desired MSR intercept (read: pass-through) state */ +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 + struct { + DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + } shadow_msr_intercept; }; enum ept_pointers_status { @@ -334,7 +319,7 @@ unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); @@ -343,6 +328,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, int root_level); + void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); @@ -350,75 +336,12 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e); +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) -{ - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_on(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -499,11 +422,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -558,6 +476,19 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; } +static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) +{ + return enable_unrestricted_guest && (!is_guest_mode(vcpu) || + (secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_UNRESTRICTED_GUEST)); +} + +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu); +static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) +{ + return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 692b0c31c9c8..692b0c31c9c8 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce856e0ece84..397f599b20e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -71,6 +71,7 @@ #include <asm/irq_remapping.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> +#include <asm/tlbflush.h> #include <asm/intel_pt.h> #include <asm/emulate_prefix.h> #include <clocksource/hyperv_timer.h> @@ -161,24 +162,29 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); -#define KVM_NR_SHARED_MSRS 16 +/* + * Restoring the host value for MSRs that are only consumed when running in + * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU + * returns to userspace, i.e. the kernel can run with the guest's value. + */ +#define KVM_MAX_NR_USER_RETURN_MSRS 16 -struct kvm_shared_msrs_global { +struct kvm_user_return_msrs_global { int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; }; -struct kvm_shared_msrs { +struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; - struct kvm_shared_msr_values { + struct kvm_user_return_msr_values { u64 host; u64 curr; - } values[KVM_NR_SHARED_MSRS]; + } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; +static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; +static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -266,7 +272,7 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, } else { vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", op, msr, data); - return 1; + return -ENOENT; } } @@ -293,9 +299,9 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; + struct kvm_user_return_msrs *msrs + = container_of(urn, struct kvm_user_return_msrs, urn); + struct kvm_user_return_msr_values *values; unsigned long flags; /* @@ -303,73 +309,73 @@ static void kvm_on_user_return(struct user_return_notifier *urn) * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); - if (locals->registered) { - locals->registered = false; + if (msrs->registered) { + msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; + for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { + values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); + wrmsrl(user_return_msrs_global.msrs[slot], values->host); values->curr = values->host; } } } -void kvm_define_shared_msr(unsigned slot, u32 msr) +void kvm_define_user_return_msr(unsigned slot, u32 msr) { - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; + BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); + user_return_msrs_global.msrs[slot] = msr; + if (slot >= user_return_msrs_global.nr) + user_return_msrs_global.nr = slot + 1; } -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); +EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); -static void kvm_shared_msr_cpu_online(void) +static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; int i; - for (i = 0; i < shared_msrs_global.nr; ++i) { - rdmsrl_safe(shared_msrs_global.msrs[i], &value); - smsr->values[i].host = value; - smsr->values[i].curr = value; + for (i = 0; i < user_return_msrs_global.nr; ++i) { + rdmsrl_safe(user_return_msrs_global.msrs[i], &value); + msrs->values[i].host = value; + msrs->values[i].curr = value; } } -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; - value = (value & mask) | (smsr->values[slot].host & ~mask); - if (value == smsr->values[slot].curr) + value = (value & mask) | (msrs->values[slot].host & ~mask); + if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); if (err) return 1; - smsr->values[slot].curr = value; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; + msrs->values[slot].curr = value; + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); +EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - if (smsr->registered) - kvm_on_user_return(&smsr->urn); + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) @@ -1452,6 +1458,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; + int r; if (efer & efer_reserved_bits) return 1; @@ -1468,7 +1475,11 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops.set_efer(vcpu, efer); + r = kvm_x86_ops.set_efer(vcpu, efer); + if (r) { + WARN_ON(r > 0); + return r; + } /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -1483,6 +1494,40 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) +{ + struct kvm *kvm = vcpu->kvm; + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + u32 count = kvm->arch.msr_filter.count; + u32 i; + bool r = kvm->arch.msr_filter.default_allow; + int idx; + + /* MSR filtering not set up or x2APIC enabled, allow everything */ + if (!count || (index >= 0x800 && index <= 0x8ff)) + return true; + + /* Prevent collision with set_msr_filter */ + idx = srcu_read_lock(&kvm->srcu); + + for (i = 0; i < count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + r = !!test_bit(index - start, bitmap); + break; + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_msr_allowed); + /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. @@ -1494,6 +1539,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return -EPERM; + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1550,6 +1598,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return -EPERM; + msr.index = index; msr.host_initiated = host_initiated; @@ -1585,12 +1636,91 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); +static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +{ + if (vcpu->run->msr.error) { + kvm_inject_gp(vcpu, 0); + return 1; + } else if (is_read) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, true); +} + +static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, false); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case -ENOENT: + return KVM_MSR_EXIT_REASON_UNKNOWN; + case -EPERM: + return KVM_MSR_EXIT_REASON_FILTER; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r); +} + +static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_wrmsr, r); +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + int r; + + r = kvm_get_msr(vcpu, ecx, &data); - if (kvm_get_msr(vcpu, ecx, &data)) { + /* MSR read failed? See if we should ask user space */ + if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR read failed? Inject a #GP */ + if (r) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; @@ -1608,8 +1738,21 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + int r; - if (kvm_set_msr(vcpu, ecx, data)) { + r = kvm_set_msr(vcpu, ecx, data); + + /* MSR write failed? See if we should ask user space */ + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) + /* Bounce to user space */ + return 0; + + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; + + /* MSR write failed? Inject a #GP */ + if (r > 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -1775,12 +1918,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; @@ -1788,6 +1925,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) struct pvclock_wall_clock wc; u64 wall_nsec; + kvm->arch.wall_clock = wall_clock; + if (!wall_clock) return; @@ -1820,6 +1959,34 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, + bool old_msr, bool host_initiated) +{ + struct kvm_arch *ka = &vcpu->kvm->arch; + + if (vcpu->vcpu_id == 0 && !host_initiated) { + if (ka->boot_vcpu_runs_old_kvmclock && old_msr) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + ka->boot_vcpu_runs_old_kvmclock = old_msr; + } + + vcpu->arch.time = system_time; + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; + if (!(system_time & 1)) + return; + + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) + vcpu->arch.pv_time_enabled = true; + + return; +} + static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { do_shl32_div32(dividend, divisor); @@ -1979,12 +2146,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) #endif } -static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) -{ - u64 curr_offset = vcpu->arch.l1_tsc_offset; - vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; -} - /* * Multiply tsc by a fixed point number represented by ratio. * @@ -2046,14 +2207,13 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; bool matched; bool already_matched; - u64 data = msr->data; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2062,7 +2222,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - if (data == 0 && msr->host_initiated) { + if (data == 0) { /* * detection of vcpu initialization -- need to sync * with other vCPUs. This particularly helps to keep @@ -2132,9 +2292,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) - update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -2149,8 +2306,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); - static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { @@ -2696,24 +2851,19 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; u8 *page; - int r; - r = -E2BIG; if (page_num >= blob_size) - goto out; - r = -ENOMEM; + return 1; + page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; + if (IS_ERR(page)) + return PTR_ERR(page); + + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { + kfree(page); + return 1; } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; + return 0; } static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) @@ -2731,6 +2881,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; @@ -2808,10 +2966,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ - trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb_guest(vcpu); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + } vcpu->arch.st.preempted = 0; @@ -2945,7 +3105,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_ia32_power_ctl = data; break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); + if (msr_info->host_initiated) { + kvm_synchronize_tsc(vcpu, data); + } else { + u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + adjust_tsc_offset_guest(vcpu, adj); + vcpu->arch.ia32_tsc_adjust_msr += adj; + } break; case MSR_IA32_XSS: if (!msr_info->host_initiated && @@ -2966,53 +3132,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + kvm_write_wall_clock(vcpu->kvm, data); + break; case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - struct kvm_arch *ka = &vcpu->kvm->arch; - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... */ - vcpu->arch.pv_time_enabled = false; - if (!(data & 1)) - break; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); + break; + case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; - } case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -3029,11 +3196,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; @@ -3229,7 +3402,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * state this but appears to behave the same. * * On userspace reads and writes, however, we unconditionally - * operate L1's TSC value to ensure backwards-compatible + * return L1's TSC value to ensure backwards-compatible * behavior for migration. */ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : @@ -3527,6 +3700,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4397,6 +4573,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_x86_ops.enable_direct_tlbflush(vcpu); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + + return 0; + default: return -EINVAL; } @@ -5047,6 +5228,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_USER_SPACE_MSR: + kvm->arch.user_space_msr_mask = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -5054,6 +5239,110 @@ split_irqchip_unlock: return r; } +static void kvm_clear_msr_filter(struct kvm *kvm) +{ + u32 i; + u32 count = kvm->arch.msr_filter.count; + struct msr_bitmap_range ranges[16]; + + mutex_lock(&kvm->lock); + kvm->arch.msr_filter.count = 0; + memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); + mutex_unlock(&kvm->lock); + synchronize_srcu(&kvm->srcu); + + for (i = 0; i < count; i++) + kfree(ranges[i].bitmap); +} + +static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +{ + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + struct msr_bitmap_range range; + unsigned long *bitmap = NULL; + size_t bitmap_size; + int r; + + if (!user_range->nmsrs) + return 0; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + range = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { + r = -EINVAL; + goto err; + } + + if (!range.flags) { + r = -EINVAL; + goto err; + } + + /* Everything ok, add this range identifier to our global pool */ + ranges[kvm->arch.msr_filter.count] = range; + /* Make sure we filled the array before we tell anyone to walk it */ + smp_wmb(); + kvm->arch.msr_filter.count++; + + return 0; +err: + kfree(bitmap); + return r; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + bool default_allow; + int r = 0; + bool empty = true; + u32 i; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) + empty &= !filter.ranges[i].nmsrs; + + default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + if (empty && !default_allow) + return -EINVAL; + + kvm_clear_msr_filter(kvm); + + kvm->arch.msr_filter.default_allow = default_allow; + + /* + * Protect from concurrent calls to this function that could trigger + * a TOCTOU violation on kvm->arch.msr_filter.count. + */ + mutex_lock(&kvm->lock); + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + r = kvm_add_msr_filter(kvm, &filter.ranges[i]); + if (r) + break; + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5360,6 +5649,9 @@ set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; + case KVM_X86_SET_MSR_FILTER: + r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -5721,6 +6013,9 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0))) + return 1; + if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && @@ -6376,13 +6671,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_get_msr(vcpu, msr_index, pdata); + + if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_set_msr(vcpu, msr_index, data); + + if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) @@ -6926,7 +7241,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6934,6 +7252,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. */ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); @@ -7528,9 +7847,9 @@ int kvm_arch_init(void *opaque) goto out_free_x86_fpu_cache; } - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); + if (!user_return_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); goto out_free_x86_emulator_cache; } @@ -7563,7 +7882,7 @@ int kvm_arch_init(void *opaque) return 0; out_free_percpu: - free_percpu(shared_msrs); + free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); out_free_x86_fpu_cache: @@ -7590,7 +7909,7 @@ void kvm_arch_exit(void) #endif kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); - free_percpu(shared_msrs); + free_percpu(user_return_msrs); kmem_cache_destroy(x86_fpu_cache); } @@ -7731,11 +8050,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) goto out; } + ret = -KVM_ENOSYS; + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); kvm_sched_yield(vcpu->kvm, a1); ret = 0; @@ -7746,9 +8070,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; case KVM_HC_SCHED_YIELD: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + kvm_sched_yield(vcpu->kvm, a0); ret = 0; break; @@ -8379,8 +8709,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; goto out; } @@ -8487,6 +8817,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + kvm_x86_ops.msr_filter_changed(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -8562,7 +8894,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops.request_immediate_exit(vcpu); } - trace_kvm_entry(vcpu->vcpu_id); + trace_kvm_entry(vcpu); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -9576,7 +9908,6 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct msr_data msr; struct kvm *kvm = vcpu->kvm; kvm_hv_vcpu_postcreate(vcpu); @@ -9584,10 +9915,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - msr.data = 0x0; - msr.index = MSR_IA32_TSC; - msr.host_initiated = true; - kvm_write_tsc(vcpu, &msr); + kvm_synchronize_tsc(vcpu, 0); vcpu_put(vcpu); /* poll control enabled by default */ @@ -9624,6 +9952,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + kvfree(vcpu->arch.cpuid_entries); if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } @@ -9721,7 +10050,7 @@ int kvm_arch_hardware_enable(void) u64 max_tsc = 0; bool stable, backwards_tsc = false; - kvm_shared_msr_cpu_online(); + kvm_user_return_msr_cpu_online(); ret = kvm_x86_ops.hardware_enable(); if (ret != 0) return ret; @@ -10039,6 +10368,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u32 i; + if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10055,6 +10386,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops.vm_destroy) kvm_x86_ops.vm_destroy(kvm); + for (i = 0; i < kvm->arch.msr_filter.count; i++) + kfree(kvm->arch.msr_filter.ranges[i].bitmap); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); @@ -10785,6 +11118,111 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c } EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); +/* + * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns + * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value + * indicates whether exit to userspace is needed. + */ +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e) +{ + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, e); + return 1; + } + + /* + * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED + * while handling a VMX instruction KVM could've handled the request + * correctly by exiting to userspace and performing I/O but there + * doesn't seem to be a real use-case behind such requests, just return + * KVM_EXIT_INTERNAL_ERROR for now. + */ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); + +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) +{ + bool pcid_enabled; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + int r; + + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + fallthrough; + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} +EXPORT_SYMBOL_GPL(kvm_handle_invpcid); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab696dcf0..3900ab0c6004 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -372,6 +371,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e); +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); #define KVM_MSR_RET_INVALID 2 diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 2cd902e06062..fa1bc2104b32 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -37,10 +37,19 @@ #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC +#ifdef CONFIG_X86_5LEVEL +#define LOAD_TASK_SIZE_MINUS_N(n) \ + ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \ + __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57 +#else +#define LOAD_TASK_SIZE_MINUS_N(n) \ + mov $(TASK_SIZE_MAX - (n)),%_ASM_DX +#endif + .text SYM_FUNC_START(__get_user_1) - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(0) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX @@ -53,15 +62,13 @@ SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) SYM_FUNC_START(__get_user_2) - add $1,%_ASM_AX - jc bad_get_user - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(1) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -2: movzwl -1(%_ASM_AX),%edx +2: movzwl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret @@ -69,15 +76,13 @@ SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) SYM_FUNC_START(__get_user_4) - add $3,%_ASM_AX - jc bad_get_user - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(3) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -3: movl -3(%_ASM_AX),%edx +3: movl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret @@ -86,29 +91,25 @@ EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) #ifdef CONFIG_X86_64 - add $7,%_ASM_AX - jc bad_get_user - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(7) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -4: movq -7(%_ASM_AX),%rdx +4: movq (%_ASM_AX),%rdx xor %eax,%eax ASM_CLAC ret #else - add $7,%_ASM_AX - jc bad_get_user_8 - mov PER_CPU_VAR(current_task), %_ASM_DX - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + LOAD_TASK_SIZE_MINUS_N(7) + cmp %_ASM_DX,%_ASM_AX jae bad_get_user_8 sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ and %_ASM_DX, %_ASM_AX ASM_STAC -4: movl -7(%_ASM_AX),%edx -5: movl -3(%_ASM_AX),%ecx +4: movl (%_ASM_AX),%edx +5: movl 4(%_ASM_AX),%ecx xor %eax,%eax ASM_CLAC ret diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index b34a17763f28..0ea344c5ea43 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -33,12 +33,19 @@ * as they get called from within inline assembly. */ -#define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX +#ifdef CONFIG_X86_5LEVEL +#define LOAD_TASK_SIZE_MINUS_N(n) \ + ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \ + __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57 +#else +#define LOAD_TASK_SIZE_MINUS_N(n) \ + mov $(TASK_SIZE_MAX - (n)),%_ASM_BX +#endif .text SYM_FUNC_START(__put_user_1) - ENTER - cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX + LOAD_TASK_SIZE_MINUS_N(0) + cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) ASM_STAC @@ -51,9 +58,7 @@ EXPORT_SYMBOL(__put_user_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) - ENTER - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX - sub $1,%_ASM_BX + LOAD_TASK_SIZE_MINUS_N(1) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) @@ -67,9 +72,7 @@ EXPORT_SYMBOL(__put_user_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) - ENTER - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX - sub $3,%_ASM_BX + LOAD_TASK_SIZE_MINUS_N(3) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) @@ -83,9 +86,7 @@ EXPORT_SYMBOL(__put_user_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) - ENTER - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX - sub $7,%_ASM_BX + LOAD_TASK_SIZE_MINUS_N(7) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 42606a04ae85..82bf37a5c9ec 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1446,11 +1446,14 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) prefetchw(¤t->mm->mmap_lock); /* - * KVM has two types of events that are, logically, interrupts, but - * are unfortunately delivered using the #PF vector. These events are - * "you just accessed valid memory, but the host doesn't have it right - * now, so I'll put you to sleep if you continue" and "that memory - * you tried to access earlier is available now." + * KVM uses #PF vector to deliver 'page not present' events to guests + * (asynchronous page fault mechanism). The event happens when a + * userspace task is trying to access some valid (from guest's point of + * view) memory which is not currently mapped by the host (e.g. the + * memory is swapped out). Note, the corresponding "page ready" event + * which is injected when the memory becomes available, is delived via + * an interrupt mechanism and not a #PF exception + * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 42b6709e6dc7..796506dcfc42 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -221,14 +221,48 @@ struct jit_context { /* Number of bytes emit_patch() needs to generate instructions */ #define X86_PATCH_SIZE 5 +/* Number of bytes that will be skipped on tailcall */ +#define X86_TAIL_CALL_OFFSET 11 -#define PROLOGUE_SIZE 25 +static void push_callee_regs(u8 **pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[0]) + EMIT1(0x53); /* push rbx */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x55); /* push r13 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x56); /* push r14 */ + if (callee_regs_used[3]) + EMIT2(0x41, 0x57); /* push r15 */ + *pprog = prog; +} + +static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[3]) + EMIT2(0x41, 0x5F); /* pop r15 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x5E); /* pop r14 */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x5D); /* pop r13 */ + if (callee_regs_used[0]) + EMIT1(0x5B); /* pop rbx */ + *pprog = prog; +} /* - * Emit x86-64 prologue code for BPF program and check its size. - * bpf_tail_call helper will skip it while jumping into another program + * Emit x86-64 prologue code for BPF program. + * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes + * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, + bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; int cnt = X86_PATCH_SIZE; @@ -238,19 +272,19 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) */ memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); prog += cnt; + if (!ebpf_from_cbpf) { + if (tail_call_reachable && !is_subprog) + EMIT2(0x31, 0xC0); /* xor eax, eax */ + else + EMIT2(0x66, 0x90); /* nop2 */ + } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ /* sub rsp, rounded_stack_depth */ - EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); - EMIT1(0x53); /* push rbx */ - EMIT2(0x41, 0x55); /* push r13 */ - EMIT2(0x41, 0x56); /* push r14 */ - EMIT2(0x41, 0x57); /* push r15 */ - if (!ebpf_from_cbpf) { - /* zero init tail_call_cnt */ - EMIT2(0x6a, 0x00); - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); - } + if (stack_depth) + EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); + if (tail_call_reachable) + EMIT1(0x50); /* push rax */ *pprog = prog; } @@ -314,13 +348,14 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, mutex_lock(&text_mutex); if (memcmp(ip, old_insn, X86_PATCH_SIZE)) goto out; + ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { if (text_live) text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); else memcpy(ip, new_insn, X86_PATCH_SIZE); + ret = 0; } - ret = 0; out: mutex_unlock(&text_mutex); return ret; @@ -337,6 +372,22 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } +static int get_pop_bytes(bool *callee_regs_used) +{ + int bytes = 0; + + if (callee_regs_used[3]) + bytes += 2; + if (callee_regs_used[2]) + bytes += 2; + if (callee_regs_used[1]) + bytes += 2; + if (callee_regs_used[0]) + bytes += 1; + + return bytes; +} + /* * Generate the following code: * @@ -351,12 +402,32 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call_indirect(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, + u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; - int label1, label2, label3; + int pop_bytes = 0; + int off1 = 42; + int off2 = 31; + int off3 = 9; int cnt = 0; + /* count the additional bytes used for popping callee regs from stack + * that need to be taken into account for each of the offsets that + * are used for bailing out of the tail call + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + off2 += pop_bytes; + off3 += pop_bytes; + + if (stack_depth) { + off1 += 7; + off2 += 7; + off3 += 7; + } + /* * rdi - pointer to ctx * rsi - pointer to bpf_array @@ -370,72 +441,112 @@ static void emit_bpf_tail_call_indirect(u8 **pprog) EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */ +#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ - label1 = cnt; /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) +#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ - label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); /* * if (prog == NULL) * goto out; */ - EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ -#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) + EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ +#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JE, OFFSET3); /* je out */ - label3 = cnt; - /* goto *(prog->bpf_func + prologue_size); */ - EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ - offsetof(struct bpf_prog, bpf_func)); - EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + + EMIT1(0x58); /* pop rax */ + if (stack_depth) + EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */ + round_up(stack_depth, 8)); + /* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */ + EMIT4(0x48, 0x8B, 0x49, /* mov rcx, qword ptr [rcx + 32] */ + offsetof(struct bpf_prog, bpf_func)); + EMIT4(0x48, 0x83, 0xC1, /* add rcx, X86_TAIL_CALL_OFFSET */ + X86_TAIL_CALL_OFFSET); /* - * Wow we're ready to jump into next BPF program + * Now we're ready to jump into next BPF program * rdi == ctx (1st arg) - * rax == prog->bpf_func + prologue_size + * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - RETPOLINE_RAX_BPF_JIT(); + RETPOLINE_RCX_BPF_JIT(); /* out: */ - BUILD_BUG_ON(cnt - label1 != OFFSET1); - BUILD_BUG_ON(cnt - label2 != OFFSET2); - BUILD_BUG_ON(cnt - label3 != OFFSET3); *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image) + u8 **pprog, int addr, u8 *image, + bool *callee_regs_used, u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; + int pop_bytes = 0; + int off1 = 20; + int poke_off; int cnt = 0; + /* count the additional bytes used for popping callee regs to stack + * that need to be taken into account for jump offset that is used for + * bailing out from of the tail call when limit is reached + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + + /* + * total bytes for: + * - nop5/ jmpq $off + * - pop callee regs + * - sub rsp, $val if depth > 0 + * - pop rax + */ + poke_off = X86_PATCH_SIZE + pop_bytes + 1; + if (stack_depth) { + poke_off += 7; + off1 += 7; + } + /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, 14); /* ja out */ + EMIT2(X86_JA, off1); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ - poke->ip = image + (addr - X86_PATCH_SIZE); - poke->adj_off = PROLOGUE_SIZE; + poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->adj_off = X86_TAIL_CALL_OFFSET; + poke->tailcall_target = image + (addr - X86_PATCH_SIZE); + poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; + + emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, + poke->tailcall_bypass); + + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + EMIT1(0x58); /* pop rax */ + if (stack_depth) + EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; @@ -453,7 +564,7 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; - WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -464,18 +575,25 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) if (target) { /* Plain memcpy is used when image is not live yet * and still not locked as read-only. Once poke - * location is active (poke->ip_stable), any parallel - * bpf_arch_text_poke() might occur still on the - * read-write image until we finally locked it as - * read-only. Both modifications on the given image - * are under text_mutex to avoid interference. + * location is active (poke->tailcall_target_stable), + * any parallel bpf_arch_text_poke() might occur + * still on the read-write image until we finally + * locked it as read-only. Both modifications on + * the given image are under text_mutex to avoid + * interference. */ - ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, NULL, (u8 *)target->bpf_func + poke->adj_off, false); BUG_ON(ret < 0); + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + (u8 *)poke->tailcall_target + + X86_PATCH_SIZE, NULL, false); + BUG_ON(ret < 0); } - WRITE_ONCE(poke->ip_stable, true); + WRITE_ONCE(poke->tailcall_target_stable, true); mutex_unlock(&array->aux->poke_mutex); } } @@ -652,19 +770,49 @@ static bool ex_handler_bpf(const struct exception_table_entry *x, return true; } +static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, + bool *regs_used, bool *tail_call_seen) +{ + int i; + + for (i = 1; i <= insn_cnt; i++, insn++) { + if (insn->code == (BPF_JMP | BPF_TAIL_CALL)) + *tail_call_seen = true; + if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6) + regs_used[0] = true; + if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7) + regs_used[1] = true; + if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8) + regs_used[2] = true; + if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9) + regs_used[3] = true; + } +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { + bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; + bool callee_regs_used[4] = {}; int insn_cnt = bpf_prog->len; + bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; + detect_reg_usage(insn, insn_cnt, callee_regs_used, + &tail_call_seen); + + /* tail call's presence in current prog implies it is reachable */ + tail_call_reachable |= tail_call_seen; + emit_prologue(&prog, bpf_prog->aux->stack_depth, - bpf_prog_was_classic(bpf_prog)); + bpf_prog_was_classic(bpf_prog), tail_call_reachable, + bpf_prog->aux->func_idx != 0); + push_callee_regs(&prog, callee_regs_used); addrs[0] = prog - temp; for (i = 1; i <= insn_cnt; i++, insn++) { @@ -1102,16 +1250,27 @@ xadd: if (is_imm8(insn->off)) /* call */ case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) - return -EINVAL; + if (tail_call_reachable) { + EMIT3_off32(0x48, 0x8B, 0x85, + -(bpf_prog->aux->stack_depth + 8)); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + return -EINVAL; + } else { + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) + return -EINVAL; + } break; case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image); + &prog, addrs[i], image, + callee_regs_used, + bpf_prog->aux->stack_depth); else - emit_bpf_tail_call_indirect(&prog); + emit_bpf_tail_call_indirect(&prog, + callee_regs_used, + bpf_prog->aux->stack_depth); break; /* cond jump */ @@ -1294,12 +1453,7 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; - if (!bpf_prog_was_classic(bpf_prog)) - EMIT1(0x5B); /* get rid of tail_call_cnt */ - EMIT2(0x41, 0x5F); /* pop r15 */ - EMIT2(0x41, 0x5E); /* pop r14 */ - EMIT2(0x41, 0x5D); /* pop r13 */ - EMIT1(0x5B); /* pop rbx */ + pop_callee_regs(&prog, callee_regs_used); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ break; @@ -1379,10 +1533,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; int cnt = 0; - if (emit_call(&prog, __bpf_prog_enter, prog)) - return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) + return -EINVAL; + } else { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + } /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1402,13 +1561,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + return -EINVAL; + } else { + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, + (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } *pprog = prog; return 0; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b8c9a5b87f37..0a0e168be1cb 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -587,7 +587,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar); static void pci_fixup_amd_ehci_pme(struct pci_dev *dev) { dev_info(&dev->dev, "PME# does not work under D3, disabling it\n"); - dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold) + dev->pme_support &= ~((PCI_PM_CAP_PME_D3hot | PCI_PM_CAP_PME_D3cold) >> PCI_PM_CAP_PME_SHIFT); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 00c62115f39c..24ca4ee2802f 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -33,6 +33,7 @@ #include <asm/hw_irq.h> #include <asm/io_apic.h> #include <asm/intel-mid.h> +#include <asm/acpi.h> #define PCIE_CAP_OFFSET 0x100 @@ -322,7 +323,7 @@ static void pci_d3delay_fixup(struct pci_dev *dev) */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; - dev->d3_delay = 0; + dev->d3hot_delay = 0; } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index c313d784efab..5701d5ba3df4 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -15,7 +15,6 @@ #include <asm/iommu.h> #define STA2X11_SWIOTLB_SIZE (4*1024*1024) -extern int swiotlb_late_init_with_default_size(size_t default_size); /* * We build a list of bus numbers that are under the ConneXt. The @@ -133,7 +132,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev) struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); struct device *dev = &pdev->dev; u32 amba_base, max_amba_addr; - int i; + int i, ret; if (!instance) return; @@ -141,7 +140,9 @@ static void sta2x11_map_ep(struct pci_dev *pdev) pci_read_config_dword(pdev, AHB_BASE(0), &amba_base); max_amba_addr = amba_base + STA2X11_AMBA_SIZE - 1; - dev->dma_pfn_offset = PFN_DOWN(-amba_base); + ret = dma_direct_set_offset(dev, 0, amba_base, STA2X11_AMBA_SIZE); + if (ret) + dev_err(dev, "sta2x11: could not set DMA offset\n"); dev->bus_dma_limit = max_amba_addr; pci_set_consistent_dma_mask(pdev, max_amba_addr); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 09a085bde0d4..1401899dee9b 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -52,14 +52,6 @@ static const int reg_offsets[] = int putreg(struct task_struct *child, int regno, unsigned long value) { -#ifdef TIF_IA32 - /* - * Some code in the 64bit emulation may not be 64bit clean. - * Don't take any chances. - */ - if (test_tsk_thread_flag(child, TIF_IA32)) - value &= 0xffffffff; -#endif switch (regno) { case R8: case R9: @@ -137,10 +129,7 @@ int poke_user(struct task_struct *child, long addr, long data) unsigned long getreg(struct task_struct *child, int regno) { unsigned long mask = ~0UL; -#ifdef TIF_IA32 - if (test_tsk_thread_flag(child, TIF_IA32)) - mask = 0xffffffff; -#endif + switch (regno) { case R8: case R9: diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index c51dd8363d25..bae61554abcc 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -2,7 +2,7 @@ #include <stdio.h> #include <stddef.h> #include <signal.h> -#include <sys/poll.h> +#include <poll.h> #include <sys/mman.h> #include <sys/user.h> #define __FRAME_OFFSETS diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 4988e19598c8..1e681bf62561 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -25,6 +25,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; + int idx; } gnttab_shared_vm_area, gnttab_status_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, @@ -90,19 +91,31 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) } } +static int gnttab_apply(pte_t *pte, unsigned long addr, void *data) +{ + struct gnttab_vm_area *area = data; + + area->ptes[area->idx++] = pte; + return 0; +} + static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) { area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL); if (area->ptes == NULL) return -ENOMEM; - - area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes); - if (area->area == NULL) { - kfree(area->ptes); - return -ENOMEM; - } - + area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP); + if (!area->area) + goto out_free_ptes; + if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr, + PAGE_SIZE * nr_frames, gnttab_apply, area)) + goto out_free_vm_area; return 0; +out_free_vm_area: + free_vm_area(area->area); +out_free_ptes: + kfree(area->ptes); + return -ENOMEM; } static void arch_gnttab_vfree(struct gnttab_vm_area *area) diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 33293ce01d8d..19ae3e4fe4e9 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -2,7 +2,7 @@ /* Glue code to lib/swiotlb-xen.c */ -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/pci.h> #include <xen/swiotlb-xen.h> diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index d8a29dc5a284..d0dfa50bd0bb 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -41,6 +41,7 @@ config XTENSA select IRQ_DOMAIN select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC + select SET_FS select VIRT_TO_BUS help Xtensa processors are 32-bit RISC machines designed by Tensilica @@ -523,7 +524,7 @@ config MEMMAP_CACHEATTR 2: cache bypass, 4: WB cached, f: illegal. - For ful MMU: + For full MMU: bit 0: executable, bit 1: writable, bits 2..3: diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 17c4384f8495..94955caa4488 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -11,8 +11,7 @@ * Joe Taylor <joe@tensilica.com, joetylr@yahoo.com> */ -#include <linux/dma-contiguous.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/dma-direct.h> #include <linux/gfp.h> #include <linux/highmem.h> diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index b3b17d6c50f0..1fb1047f905c 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -501,6 +501,6 @@ void do_notify_resume(struct pt_regs *regs) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 6276e3c2d3fc..b070f272995d 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -410,3 +410,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index ad9d59d93f39..c6fc83efee0c 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -26,7 +26,7 @@ #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/of_fdt.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <asm/bootparam.h> #include <asm/page.h> |